From 99ee88aeb64e822f66692b0c7456c0fb07bae8df Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 29 Jun 2012 17:02:28 +0000 Subject: [PATCH] Import pandas_0.8.0.orig.tar.gz [dgit import orig pandas_0.8.0.orig.tar.gz] --- .coveragerc | 26 + .gitignore | 25 + .travis.yml | 16 + LICENSE | 80 + MANIFEST.in | 25 + Makefile | 11 + NP_LICENSE.txt | 30 + README.rst | 125 + RELEASE.rst | 1752 +++++ TODO.rst | 60 + bench/alignment.py | 21 + bench/bench_dense_to_sparse.py | 15 + bench/bench_get_put_value.py | 48 + bench/bench_groupby.py | 61 + bench/bench_join_panel.py | 77 + bench/bench_khash_dict.py | 78 + bench/bench_merge.R | 161 + bench/bench_merge.py | 104 + bench/bench_merge_sqlite.py | 84 + bench/bench_pivot.R | 27 + bench/bench_pivot.py | 16 + bench/bench_sparse.py | 92 + bench/bench_take_indexing.py | 52 + bench/bench_unique.py | 264 + bench/better_unique.py | 76 + bench/duplicated.R | 22 + bench/io_roundtrip.py | 107 + bench/larry.py | 0 bench/serialize.py | 80 + bench/test.py | 65 + bench/zoo_bench.R | 71 + bench/zoo_bench.py | 35 + doc/data/baseball.csv | 101 + doc/data/iris.data | 152 + doc/data/mindex_ex.csv | 16 + doc/make.py | 212 + doc/plots/stats/moment_plots.py | 29 + doc/plots/stats/moments_ewma.py | 15 + doc/plots/stats/moments_ewmvol.py | 23 + doc/plots/stats/moments_expw.py | 33 + doc/plots/stats/moments_rolling.py | 24 + doc/plots/stats/moments_rolling_binary.py | 30 + doc/source/_static/stub | 0 doc/source/api.rst | 457 ++ doc/source/basics.rst | 994 +++ doc/source/comparison_with_r.rst | 38 + doc/source/computation.rst | 536 ++ doc/source/conf.py | 248 + doc/source/dsintro.rst | 788 ++ doc/source/faq.rst | 174 + doc/source/gotchas.rst | 243 + doc/source/groupby.rst | 606 ++ doc/source/index.rst | 132 + doc/source/indexing.rst | 1011 +++ doc/source/install.rst | 152 + doc/source/io.rst | 783 ++ doc/source/merging.rst | 613 ++ doc/source/missing_data.rst | 369 + doc/source/overview.rst | 121 + doc/source/r_interface.rst | 99 + doc/source/related.rst | 47 + doc/source/reshaping.rst | 357 + doc/source/sparse.rst | 135 + doc/source/themes/agogo/layout.html | 95 + doc/source/themes/agogo/static/agogo.css_t | 476 ++ doc/source/themes/agogo/static/bgfooter.png | Bin 0 -> 434 bytes doc/source/themes/agogo/static/bgtop.png | Bin 0 -> 430 bytes doc/source/themes/agogo/theme.conf | 19 + doc/source/timeseries.rst | 913 +++ doc/source/v0.4.x.txt | 77 + doc/source/v0.5.0.txt | 60 + doc/source/v0.6.0.txt | 100 + doc/source/v0.6.1.txt | 62 + doc/source/v0.7.0.txt | 308 + doc/source/v0.7.1.txt | 39 + doc/source/v0.7.2.txt | 38 + doc/source/v0.7.3.txt | 96 + doc/source/v0.8.0.txt | 274 + doc/source/visualization.rst | 343 + doc/source/whatsnew.rst | 36 + doc/sphinxext/LICENSE.txt | 97 + doc/sphinxext/MANIFEST.in | 2 + doc/sphinxext/README.txt | 52 + doc/sphinxext/__init__.py | 1 + doc/sphinxext/comment_eater.py | 158 + doc/sphinxext/compiler_unparse.py | 860 +++ doc/sphinxext/docscrape.py | 499 ++ doc/sphinxext/docscrape_sphinx.py | 226 + doc/sphinxext/ipython_console_highlighting.py | 114 + doc/sphinxext/ipython_directive.py | 909 +++ doc/sphinxext/numpydoc.py | 169 + doc/sphinxext/only_directives.py | 96 + doc/sphinxext/phantom_import.py | 162 + doc/sphinxext/plot_directive.py | 641 ++ doc/sphinxext/setup.py | 31 + doc/sphinxext/tests/test_docscrape.py | 545 ++ doc/sphinxext/traitsdoc.py | 140 + examples/data/SOURCES | 0 examples/finance.py | 83 + examples/regressions.py | 49 + ez_setup.py | 284 + fake_pyrex/Pyrex/Distutils/__init__.py | 1 + fake_pyrex/Pyrex/Distutils/build_ext.py 
| 1 + fake_pyrex/Pyrex/__init__.py | 1 + pandas/__init__.py | 39 + pandas/compat/__init__.py | 0 pandas/compat/scipy.py | 242 + pandas/core/__init__.py | 0 pandas/core/algorithms.py | 320 + pandas/core/api.py | 31 + pandas/core/categorical.py | 127 + pandas/core/common.py | 928 +++ pandas/core/daterange.py | 45 + pandas/core/datetools.py | 32 + pandas/core/format.py | 862 +++ pandas/core/frame.py | 4922 +++++++++++++ pandas/core/generic.py | 967 +++ pandas/core/groupby.py | 2171 ++++++ pandas/core/index.py | 2497 +++++++ pandas/core/indexing.py | 614 ++ pandas/core/internals.py | 1432 ++++ pandas/core/matrix.py | 1 + pandas/core/nanops.py | 431 ++ pandas/core/panel.py | 1363 ++++ pandas/core/reshape.py | 754 ++ pandas/core/series.py | 2866 ++++++++ pandas/core/sparse.py | 10 + pandas/core/strings.py | 31 + pandas/info.py | 20 + pandas/io/__init__.py | 0 pandas/io/data.py | 196 + pandas/io/date_converters.py | 49 + pandas/io/parsers.py | 1400 ++++ pandas/io/pytables.py | 1122 +++ pandas/io/sql.py | 229 + pandas/io/tests/__init__.py | 1 + pandas/io/tests/legacy.h5 | Bin 0 -> 14928 bytes pandas/io/tests/salary.table | 47 + pandas/io/tests/test.xls | Bin 0 -> 30720 bytes pandas/io/tests/test.xlsx | Bin 0 -> 44929 bytes pandas/io/tests/test1.csv | 8 + pandas/io/tests/test2.csv | 6 + pandas/io/tests/test2.xls | Bin 0 -> 5632 bytes pandas/io/tests/test3.xls | Bin 0 -> 23040 bytes pandas/io/tests/test_date_converters.py | 99 + pandas/io/tests/test_parsers.py | 1396 ++++ pandas/io/tests/test_pytables.py | 685 ++ pandas/io/tests/test_sql.py | 169 + pandas/io/tests/test_yahoo.py | 30 + pandas/rpy/__init__.py | 1 + pandas/rpy/base.py | 13 + pandas/rpy/common.py | 375 + pandas/rpy/mass.py | 4 + pandas/rpy/vars.py | 20 + pandas/sandbox/__init__.py | 0 pandas/sandbox/qtpandas.py | 127 + pandas/sandbox/stats/__init__.py | 0 pandas/sandbox/stats/rls.py | 137 + pandas/setup.py | 26 + pandas/sparse/__init__.py | 0 pandas/sparse/api.py | 7 + pandas/sparse/array.py | 435 ++ pandas/sparse/frame.py | 857 +++ pandas/sparse/list.py | 137 + pandas/sparse/panel.py | 496 ++ pandas/sparse/series.py | 544 ++ pandas/sparse/tests/__init__.py | 0 pandas/sparse/tests/test_array.py | 154 + pandas/sparse/tests/test_libsparse.py | 365 + pandas/sparse/tests/test_list.py | 103 + pandas/sparse/tests/test_sparse.py | 1524 ++++ pandas/src/cppsandbox.pyx | 15 + pandas/src/data_algos.pyx | 1 + pandas/src/datetime.pxd | 115 + pandas/src/datetime.pyx | 1295 ++++ pandas/src/datetime/np_datetime.c | 949 +++ pandas/src/datetime/np_datetime.h | 122 + pandas/src/datetime/np_datetime_strings.c | 1456 ++++ pandas/src/datetime/np_datetime_strings.h | 86 + pandas/src/datetime_helper.h | 6 + pandas/src/engines.pyx | 526 ++ pandas/src/generate_code.py | 1233 ++++ pandas/src/generated.pyx | 4017 ++++++++++ pandas/src/groupby.pyx | 1489 ++++ pandas/src/hashtable.pxd | 51 + pandas/src/hashtable.pyx | 1028 +++ pandas/src/inference.pyx | 765 ++ pandas/src/join.pyx | 267 + pandas/src/khash.h | 608 ++ pandas/src/khash.pxd | 104 + pandas/src/ktypes.h | 6 + pandas/src/kvec.h | 142 + pandas/src/moments.pyx | 726 ++ pandas/src/ms_inttypes.h | 305 + pandas/src/ms_stdint.h | 247 + pandas/src/numpy.pxd | 980 +++ pandas/src/numpy_helper.h | 163 + pandas/src/offsets.pyx | 363 + pandas/src/period.c | 1371 ++++ pandas/src/period.h | 159 + pandas/src/plib.pyx | 352 + pandas/src/properties.pyx | 73 + pandas/src/reduce.pyx | 365 + pandas/src/reindex.pyx | 139 + pandas/src/sandbox.pyx | 500 ++ pandas/src/skiplist.h | 281 + pandas/src/skiplist.pxd | 21 + 
pandas/src/skiplist.pyx | 153 + pandas/src/sparse.pyx | 1186 +++ pandas/src/stats.pyx | 550 ++ pandas/src/stdint.h | 10 + pandas/src/tseries.pyx | 709 ++ pandas/src/util.pxd | 64 + pandas/stats/__init__.py | 0 pandas/stats/api.py | 9 + pandas/stats/common.py | 49 + pandas/stats/fama_macbeth.py | 221 + pandas/stats/interface.py | 134 + pandas/stats/math.py | 123 + pandas/stats/misc.py | 289 + pandas/stats/moments.py | 462 ++ pandas/stats/ols.py | 1320 ++++ pandas/stats/plm.py | 794 ++ pandas/stats/tests/__init__.py | 1 + pandas/stats/tests/common.py | 156 + pandas/stats/tests/test_fama_macbeth.py | 61 + pandas/stats/tests/test_math.py | 67 + pandas/stats/tests/test_moments.py | 340 + pandas/stats/tests/test_ols.py | 833 +++ pandas/stats/tests/test_var.py | 191 + pandas/stats/var.py | 586 ++ pandas/tests/__init__.py | 1 + pandas/tests/data/iris.csv | 151 + pandas/tests/data/mindex_073.pickle | Bin 0 -> 670 bytes pandas/tests/data/multiindex_v1.pickle | 149 + pandas/tests/data/unicode_series.csv | 18 + pandas/tests/test_algos.py | 56 + pandas/tests/test_common.py | 296 + pandas/tests/test_factor.py | 119 + pandas/tests/test_format.py | 779 ++ pandas/tests/test_frame.py | 6515 +++++++++++++++++ pandas/tests/test_graphics.py | 395 + pandas/tests/test_groupby.py | 2051 ++++++ pandas/tests/test_index.py | 1572 ++++ pandas/tests/test_internals.py | 428 ++ pandas/tests/test_multilevel.py | 1415 ++++ pandas/tests/test_ndframe.py | 30 + pandas/tests/test_panel.py | 1336 ++++ pandas/tests/test_reshape.py | 128 + pandas/tests/test_series.py | 2978 ++++++++ pandas/tests/test_stats.py | 118 + pandas/tests/test_tseries.py | 639 ++ pandas/tools/__init__.py | 0 pandas/tools/describe.py | 16 + pandas/tools/merge.py | 1212 +++ pandas/tools/pivot.py | 300 + pandas/tools/plotting.py | 1555 ++++ pandas/tools/tests/__init__.py | 1 + pandas/tools/tests/test_merge.py | 1434 ++++ pandas/tools/tests/test_pivot.py | 326 + pandas/tools/tests/test_tile.py | 187 + pandas/tools/tests/test_tools.py | 21 + pandas/tools/tile.py | 218 + pandas/tools/util.py | 6 + pandas/tseries/__init__.py | 0 pandas/tseries/api.py | 11 + pandas/tseries/converter.py | 697 ++ pandas/tseries/frequencies.py | 1025 +++ pandas/tseries/index.py | 1373 ++++ pandas/tseries/interval.py | 35 + pandas/tseries/offsets.py | 1173 +++ pandas/tseries/period.py | 1076 +++ pandas/tseries/plotting.py | 181 + pandas/tseries/resample.py | 329 + pandas/tseries/tests/__init__.py | 0 .../tseries/tests/data/daterange_073.pickle | Bin 0 -> 650 bytes pandas/tseries/tests/data/frame.pickle | Bin 0 -> 1182 bytes pandas/tseries/tests/data/series.pickle | Bin 0 -> 646 bytes .../tests/data/series_daterange0.pickle | Bin 0 -> 357 bytes pandas/tseries/tests/test_cursor.py | 196 + pandas/tseries/tests/test_daterange.py | 304 + pandas/tseries/tests/test_frequencies.py | 209 + pandas/tseries/tests/test_offsets.py | 1403 ++++ pandas/tseries/tests/test_period.py | 1821 +++++ pandas/tseries/tests/test_plotting.py | 665 ++ pandas/tseries/tests/test_resample.py | 792 ++ pandas/tseries/tests/test_timeseries.py | 1882 +++++ pandas/tseries/tests/test_timezones.py | 508 ++ pandas/tseries/tests/test_util.py | 64 + pandas/tseries/tools.py | 259 + pandas/tseries/util.py | 85 + pandas/util/__init__.py | 0 pandas/util/clipboard.py | 110 + pandas/util/compat.py | 14 + pandas/util/counter.py | 290 + pandas/util/decorators.py | 171 + pandas/util/map.py | 69 + pandas/util/misc.py | 4 + pandas/util/py3compat.py | 37 + pandas/util/terminal.py | 108 + pandas/util/testing.py | 380 + 
scripts/bench_join.R | 50 + scripts/bench_join.py | 197 + scripts/bench_join_multi.py | 30 + scripts/bench_refactor.py | 46 + scripts/boxplot_test.py | 14 + scripts/count_code.sh | 1 + scripts/faster_xs.py | 16 + scripts/file_sizes.py | 198 + scripts/git-mrb | 82 + scripts/git_code_churn.py | 35 + scripts/groupby_sample.py | 49 + scripts/groupby_speed.py | 31 + scripts/groupby_test.py | 142 + scripts/hdfstore_panel_perf.py | 16 + scripts/leak.py | 12 + scripts/parser_magic.py | 67 + scripts/preepoch_test.py | 22 + scripts/roll_median_leak.py | 24 + scripts/runtests.py | 3 + scripts/test_py25.bat | 8 + scripts/test_py26.bat | 8 + scripts/test_py27.bat | 6 + scripts/test_py31.bat | 8 + scripts/test_py32.bat | 8 + scripts/testmed.py | 161 + scripts/winbuild_py25.bat | 2 + scripts/winbuild_py27.bat | 2 + setup.py | 451 ++ test.sh | 10 + test_fast.sh | 1 + test_rebuild.sh | 12 + tox.ini | 39 + ts_todo.txt | 20 + vb_suite/.gitignore | 4 + vb_suite/attrs_caching.py | 20 + vb_suite/binary_ops.py | 26 + vb_suite/ctors.py | 17 + vb_suite/frame_ctor.py | 52 + vb_suite/frame_methods.py | 67 + vb_suite/generate_rst_files.py | 2 + vb_suite/groupby.py | 194 + vb_suite/index_object.py | 35 + vb_suite/indexing.py | 109 + vb_suite/io_bench.py | 46 + vb_suite/join_merge.py | 169 + vb_suite/make.py | 155 + vb_suite/miscellaneous.py | 34 + vb_suite/pandas_vb_common.py | 18 + vb_suite/panel_ctor.py | 74 + vb_suite/parser.py | 91 + vb_suite/reindex.py | 179 + vb_suite/replace.py | 28 + vb_suite/reshape.py | 34 + vb_suite/run_suite.py | 13 + vb_suite/source/_static/stub | 0 vb_suite/source/conf.py | 224 + vb_suite/source/themes/agogo/layout.html | 95 + .../source/themes/agogo/static/agogo.css_t | 476 ++ .../source/themes/agogo/static/bgfooter.png | Bin 0 -> 434 bytes vb_suite/source/themes/agogo/static/bgtop.png | Bin 0 -> 430 bytes vb_suite/source/themes/agogo/theme.conf | 19 + vb_suite/sparse.py | 29 + vb_suite/stat_ops.py | 75 + vb_suite/suite.py | 153 + vb_suite/test.py | 64 + vb_suite/timeseries.py | 134 + 367 files changed, 124530 insertions(+) create mode 100644 .coveragerc create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 MANIFEST.in create mode 100644 Makefile create mode 100644 NP_LICENSE.txt create mode 100644 README.rst create mode 100644 RELEASE.rst create mode 100644 TODO.rst create mode 100644 bench/alignment.py create mode 100644 bench/bench_dense_to_sparse.py create mode 100644 bench/bench_get_put_value.py create mode 100644 bench/bench_groupby.py create mode 100644 bench/bench_join_panel.py create mode 100644 bench/bench_khash_dict.py create mode 100644 bench/bench_merge.R create mode 100644 bench/bench_merge.py create mode 100644 bench/bench_merge_sqlite.py create mode 100644 bench/bench_pivot.R create mode 100644 bench/bench_pivot.py create mode 100644 bench/bench_sparse.py create mode 100644 bench/bench_take_indexing.py create mode 100644 bench/bench_unique.py create mode 100644 bench/better_unique.py create mode 100644 bench/duplicated.R create mode 100644 bench/io_roundtrip.py create mode 100644 bench/larry.py create mode 100644 bench/serialize.py create mode 100644 bench/test.py create mode 100644 bench/zoo_bench.R create mode 100644 bench/zoo_bench.py create mode 100644 doc/data/baseball.csv create mode 100644 doc/data/iris.data create mode 100644 doc/data/mindex_ex.csv create mode 100755 doc/make.py create mode 100644 doc/plots/stats/moment_plots.py create mode 100644 doc/plots/stats/moments_ewma.py create mode 100644 
doc/plots/stats/moments_ewmvol.py create mode 100644 doc/plots/stats/moments_expw.py create mode 100644 doc/plots/stats/moments_rolling.py create mode 100644 doc/plots/stats/moments_rolling_binary.py create mode 100644 doc/source/_static/stub create mode 100644 doc/source/api.rst create mode 100644 doc/source/basics.rst create mode 100644 doc/source/comparison_with_r.rst create mode 100644 doc/source/computation.rst create mode 100644 doc/source/conf.py create mode 100644 doc/source/dsintro.rst create mode 100644 doc/source/faq.rst create mode 100644 doc/source/gotchas.rst create mode 100644 doc/source/groupby.rst create mode 100644 doc/source/index.rst create mode 100644 doc/source/indexing.rst create mode 100644 doc/source/install.rst create mode 100644 doc/source/io.rst create mode 100644 doc/source/merging.rst create mode 100644 doc/source/missing_data.rst create mode 100644 doc/source/overview.rst create mode 100644 doc/source/r_interface.rst create mode 100644 doc/source/related.rst create mode 100644 doc/source/reshaping.rst create mode 100644 doc/source/sparse.rst create mode 100644 doc/source/themes/agogo/layout.html create mode 100644 doc/source/themes/agogo/static/agogo.css_t create mode 100644 doc/source/themes/agogo/static/bgfooter.png create mode 100644 doc/source/themes/agogo/static/bgtop.png create mode 100644 doc/source/themes/agogo/theme.conf create mode 100644 doc/source/timeseries.rst create mode 100644 doc/source/v0.4.x.txt create mode 100644 doc/source/v0.5.0.txt create mode 100644 doc/source/v0.6.0.txt create mode 100644 doc/source/v0.6.1.txt create mode 100644 doc/source/v0.7.0.txt create mode 100644 doc/source/v0.7.1.txt create mode 100644 doc/source/v0.7.2.txt create mode 100644 doc/source/v0.7.3.txt create mode 100644 doc/source/v0.8.0.txt create mode 100644 doc/source/visualization.rst create mode 100644 doc/source/whatsnew.rst create mode 100755 doc/sphinxext/LICENSE.txt create mode 100755 doc/sphinxext/MANIFEST.in create mode 100755 doc/sphinxext/README.txt create mode 100755 doc/sphinxext/__init__.py create mode 100755 doc/sphinxext/comment_eater.py create mode 100755 doc/sphinxext/compiler_unparse.py create mode 100755 doc/sphinxext/docscrape.py create mode 100755 doc/sphinxext/docscrape_sphinx.py create mode 100644 doc/sphinxext/ipython_console_highlighting.py create mode 100644 doc/sphinxext/ipython_directive.py create mode 100755 doc/sphinxext/numpydoc.py create mode 100755 doc/sphinxext/only_directives.py create mode 100755 doc/sphinxext/phantom_import.py create mode 100755 doc/sphinxext/plot_directive.py create mode 100755 doc/sphinxext/setup.py create mode 100755 doc/sphinxext/tests/test_docscrape.py create mode 100755 doc/sphinxext/traitsdoc.py create mode 100644 examples/data/SOURCES create mode 100644 examples/finance.py create mode 100644 examples/regressions.py create mode 100644 ez_setup.py create mode 100644 fake_pyrex/Pyrex/Distutils/__init__.py create mode 100644 fake_pyrex/Pyrex/Distutils/build_ext.py create mode 100644 fake_pyrex/Pyrex/__init__.py create mode 100644 pandas/__init__.py create mode 100644 pandas/compat/__init__.py create mode 100644 pandas/compat/scipy.py create mode 100644 pandas/core/__init__.py create mode 100644 pandas/core/algorithms.py create mode 100644 pandas/core/api.py create mode 100644 pandas/core/categorical.py create mode 100644 pandas/core/common.py create mode 100644 pandas/core/daterange.py create mode 100644 pandas/core/datetools.py create mode 100644 pandas/core/format.py create mode 100644 
pandas/core/frame.py create mode 100644 pandas/core/generic.py create mode 100644 pandas/core/groupby.py create mode 100644 pandas/core/index.py create mode 100644 pandas/core/indexing.py create mode 100644 pandas/core/internals.py create mode 100644 pandas/core/matrix.py create mode 100644 pandas/core/nanops.py create mode 100644 pandas/core/panel.py create mode 100644 pandas/core/reshape.py create mode 100644 pandas/core/series.py create mode 100644 pandas/core/sparse.py create mode 100644 pandas/core/strings.py create mode 100644 pandas/info.py create mode 100644 pandas/io/__init__.py create mode 100644 pandas/io/data.py create mode 100644 pandas/io/date_converters.py create mode 100644 pandas/io/parsers.py create mode 100644 pandas/io/pytables.py create mode 100644 pandas/io/sql.py create mode 100644 pandas/io/tests/__init__.py create mode 100644 pandas/io/tests/legacy.h5 create mode 100644 pandas/io/tests/salary.table create mode 100644 pandas/io/tests/test.xls create mode 100644 pandas/io/tests/test.xlsx create mode 100644 pandas/io/tests/test1.csv create mode 100644 pandas/io/tests/test2.csv create mode 100644 pandas/io/tests/test2.xls create mode 100644 pandas/io/tests/test3.xls create mode 100644 pandas/io/tests/test_date_converters.py create mode 100644 pandas/io/tests/test_parsers.py create mode 100644 pandas/io/tests/test_pytables.py create mode 100644 pandas/io/tests/test_sql.py create mode 100644 pandas/io/tests/test_yahoo.py create mode 100644 pandas/rpy/__init__.py create mode 100644 pandas/rpy/base.py create mode 100644 pandas/rpy/common.py create mode 100644 pandas/rpy/mass.py create mode 100644 pandas/rpy/vars.py create mode 100644 pandas/sandbox/__init__.py create mode 100644 pandas/sandbox/qtpandas.py create mode 100644 pandas/sandbox/stats/__init__.py create mode 100644 pandas/sandbox/stats/rls.py create mode 100644 pandas/setup.py create mode 100644 pandas/sparse/__init__.py create mode 100644 pandas/sparse/api.py create mode 100644 pandas/sparse/array.py create mode 100644 pandas/sparse/frame.py create mode 100644 pandas/sparse/list.py create mode 100644 pandas/sparse/panel.py create mode 100644 pandas/sparse/series.py create mode 100644 pandas/sparse/tests/__init__.py create mode 100644 pandas/sparse/tests/test_array.py create mode 100644 pandas/sparse/tests/test_libsparse.py create mode 100644 pandas/sparse/tests/test_list.py create mode 100644 pandas/sparse/tests/test_sparse.py create mode 100644 pandas/src/cppsandbox.pyx create mode 100644 pandas/src/data_algos.pyx create mode 100644 pandas/src/datetime.pxd create mode 100644 pandas/src/datetime.pyx create mode 100644 pandas/src/datetime/np_datetime.c create mode 100644 pandas/src/datetime/np_datetime.h create mode 100644 pandas/src/datetime/np_datetime_strings.c create mode 100644 pandas/src/datetime/np_datetime_strings.h create mode 100644 pandas/src/datetime_helper.h create mode 100644 pandas/src/engines.pyx create mode 100644 pandas/src/generate_code.py create mode 100644 pandas/src/generated.pyx create mode 100644 pandas/src/groupby.pyx create mode 100644 pandas/src/hashtable.pxd create mode 100644 pandas/src/hashtable.pyx create mode 100644 pandas/src/inference.pyx create mode 100644 pandas/src/join.pyx create mode 100644 pandas/src/khash.h create mode 100644 pandas/src/khash.pxd create mode 100644 pandas/src/ktypes.h create mode 100644 pandas/src/kvec.h create mode 100644 pandas/src/moments.pyx create mode 100644 pandas/src/ms_inttypes.h create mode 100644 pandas/src/ms_stdint.h create mode 100644 
pandas/src/numpy.pxd create mode 100644 pandas/src/numpy_helper.h create mode 100644 pandas/src/offsets.pyx create mode 100644 pandas/src/period.c create mode 100644 pandas/src/period.h create mode 100644 pandas/src/plib.pyx create mode 100644 pandas/src/properties.pyx create mode 100644 pandas/src/reduce.pyx create mode 100644 pandas/src/reindex.pyx create mode 100644 pandas/src/sandbox.pyx create mode 100644 pandas/src/skiplist.h create mode 100644 pandas/src/skiplist.pxd create mode 100644 pandas/src/skiplist.pyx create mode 100644 pandas/src/sparse.pyx create mode 100644 pandas/src/stats.pyx create mode 100644 pandas/src/stdint.h create mode 100644 pandas/src/tseries.pyx create mode 100644 pandas/src/util.pxd create mode 100644 pandas/stats/__init__.py create mode 100644 pandas/stats/api.py create mode 100644 pandas/stats/common.py create mode 100644 pandas/stats/fama_macbeth.py create mode 100644 pandas/stats/interface.py create mode 100644 pandas/stats/math.py create mode 100644 pandas/stats/misc.py create mode 100644 pandas/stats/moments.py create mode 100644 pandas/stats/ols.py create mode 100644 pandas/stats/plm.py create mode 100644 pandas/stats/tests/__init__.py create mode 100644 pandas/stats/tests/common.py create mode 100644 pandas/stats/tests/test_fama_macbeth.py create mode 100644 pandas/stats/tests/test_math.py create mode 100644 pandas/stats/tests/test_moments.py create mode 100644 pandas/stats/tests/test_ols.py create mode 100644 pandas/stats/tests/test_var.py create mode 100644 pandas/stats/var.py create mode 100644 pandas/tests/__init__.py create mode 100644 pandas/tests/data/iris.csv create mode 100644 pandas/tests/data/mindex_073.pickle create mode 100644 pandas/tests/data/multiindex_v1.pickle create mode 100644 pandas/tests/data/unicode_series.csv create mode 100644 pandas/tests/test_algos.py create mode 100644 pandas/tests/test_common.py create mode 100644 pandas/tests/test_factor.py create mode 100644 pandas/tests/test_format.py create mode 100644 pandas/tests/test_frame.py create mode 100644 pandas/tests/test_graphics.py create mode 100644 pandas/tests/test_groupby.py create mode 100644 pandas/tests/test_index.py create mode 100644 pandas/tests/test_internals.py create mode 100644 pandas/tests/test_multilevel.py create mode 100644 pandas/tests/test_ndframe.py create mode 100644 pandas/tests/test_panel.py create mode 100644 pandas/tests/test_reshape.py create mode 100644 pandas/tests/test_series.py create mode 100644 pandas/tests/test_stats.py create mode 100644 pandas/tests/test_tseries.py create mode 100644 pandas/tools/__init__.py create mode 100644 pandas/tools/describe.py create mode 100644 pandas/tools/merge.py create mode 100644 pandas/tools/pivot.py create mode 100644 pandas/tools/plotting.py create mode 100644 pandas/tools/tests/__init__.py create mode 100644 pandas/tools/tests/test_merge.py create mode 100644 pandas/tools/tests/test_pivot.py create mode 100644 pandas/tools/tests/test_tile.py create mode 100644 pandas/tools/tests/test_tools.py create mode 100644 pandas/tools/tile.py create mode 100644 pandas/tools/util.py create mode 100644 pandas/tseries/__init__.py create mode 100644 pandas/tseries/api.py create mode 100644 pandas/tseries/converter.py create mode 100644 pandas/tseries/frequencies.py create mode 100644 pandas/tseries/index.py create mode 100644 pandas/tseries/interval.py create mode 100644 pandas/tseries/offsets.py create mode 100644 pandas/tseries/period.py create mode 100644 pandas/tseries/plotting.py create mode 100644 
pandas/tseries/resample.py create mode 100644 pandas/tseries/tests/__init__.py create mode 100644 pandas/tseries/tests/data/daterange_073.pickle create mode 100644 pandas/tseries/tests/data/frame.pickle create mode 100644 pandas/tseries/tests/data/series.pickle create mode 100644 pandas/tseries/tests/data/series_daterange0.pickle create mode 100644 pandas/tseries/tests/test_cursor.py create mode 100644 pandas/tseries/tests/test_daterange.py create mode 100644 pandas/tseries/tests/test_frequencies.py create mode 100644 pandas/tseries/tests/test_offsets.py create mode 100644 pandas/tseries/tests/test_period.py create mode 100644 pandas/tseries/tests/test_plotting.py create mode 100644 pandas/tseries/tests/test_resample.py create mode 100644 pandas/tseries/tests/test_timeseries.py create mode 100644 pandas/tseries/tests/test_timezones.py create mode 100644 pandas/tseries/tests/test_util.py create mode 100644 pandas/tseries/tools.py create mode 100644 pandas/tseries/util.py create mode 100644 pandas/util/__init__.py create mode 100644 pandas/util/clipboard.py create mode 100644 pandas/util/compat.py create mode 100644 pandas/util/counter.py create mode 100644 pandas/util/decorators.py create mode 100644 pandas/util/map.py create mode 100644 pandas/util/misc.py create mode 100644 pandas/util/py3compat.py create mode 100644 pandas/util/terminal.py create mode 100644 pandas/util/testing.py create mode 100644 scripts/bench_join.R create mode 100644 scripts/bench_join.py create mode 100644 scripts/bench_join_multi.py create mode 100644 scripts/bench_refactor.py create mode 100644 scripts/boxplot_test.py create mode 100755 scripts/count_code.sh create mode 100644 scripts/faster_xs.py create mode 100644 scripts/file_sizes.py create mode 100644 scripts/git-mrb create mode 100644 scripts/git_code_churn.py create mode 100644 scripts/groupby_sample.py create mode 100644 scripts/groupby_speed.py create mode 100644 scripts/groupby_test.py create mode 100644 scripts/hdfstore_panel_perf.py create mode 100644 scripts/leak.py create mode 100644 scripts/parser_magic.py create mode 100644 scripts/preepoch_test.py create mode 100644 scripts/roll_median_leak.py create mode 100644 scripts/runtests.py create mode 100644 scripts/test_py25.bat create mode 100644 scripts/test_py26.bat create mode 100644 scripts/test_py27.bat create mode 100644 scripts/test_py31.bat create mode 100644 scripts/test_py32.bat create mode 100644 scripts/testmed.py create mode 100644 scripts/winbuild_py25.bat create mode 100644 scripts/winbuild_py27.bat create mode 100755 setup.py create mode 100755 test.sh create mode 100755 test_fast.sh create mode 100755 test_rebuild.sh create mode 100644 tox.ini create mode 100644 ts_todo.txt create mode 100644 vb_suite/.gitignore create mode 100644 vb_suite/attrs_caching.py create mode 100644 vb_suite/binary_ops.py create mode 100644 vb_suite/ctors.py create mode 100644 vb_suite/frame_ctor.py create mode 100644 vb_suite/frame_methods.py create mode 100644 vb_suite/generate_rst_files.py create mode 100644 vb_suite/groupby.py create mode 100644 vb_suite/index_object.py create mode 100644 vb_suite/indexing.py create mode 100644 vb_suite/io_bench.py create mode 100644 vb_suite/join_merge.py create mode 100755 vb_suite/make.py create mode 100644 vb_suite/miscellaneous.py create mode 100644 vb_suite/pandas_vb_common.py create mode 100644 vb_suite/panel_ctor.py create mode 100644 vb_suite/parser.py create mode 100644 vb_suite/reindex.py create mode 100644 vb_suite/replace.py create mode 100644 
vb_suite/reshape.py create mode 100644 vb_suite/run_suite.py create mode 100644 vb_suite/source/_static/stub create mode 100644 vb_suite/source/conf.py create mode 100644 vb_suite/source/themes/agogo/layout.html create mode 100644 vb_suite/source/themes/agogo/static/agogo.css_t create mode 100644 vb_suite/source/themes/agogo/static/bgfooter.png create mode 100644 vb_suite/source/themes/agogo/static/bgtop.png create mode 100644 vb_suite/source/themes/agogo/theme.conf create mode 100644 vb_suite/sparse.py create mode 100644 vb_suite/stat_ops.py create mode 100644 vb_suite/suite.py create mode 100644 vb_suite/test.py create mode 100644 vb_suite/timeseries.py diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 00000000..5b264a62 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,26 @@ +# .coveragerc to control coverage.py +[run] +branch = False + +[report] +# Regexes for lines to exclude from consideration +exclude_lines = + # Have to re-enable the standard pragma + pragma: no cover + + # Don't complain about missing debug-only code: + def __repr__ + if self\.debug + + # Don't complain if tests don't hit defensive assertion code: + raise AssertionError + raise NotImplementedError + + # Don't complain if non-runnable code isn't run: + if 0: + if __name__ == .__main__.: + +ignore_errors = False + +[html] +directory = coverage_html_report \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..d17c869c --- /dev/null +++ b/.gitignore @@ -0,0 +1,25 @@ +*.pyc +*.pyo +*.swp +build +dist +MANIFEST +*.c +!np_datetime.c +!np_datetime_strings.c +!skts.c +*.cpp +*.so +*.pyd +pandas/version.py +doc/source/generated +doc/source/_static +doc/source/vbench +doc/source/vbench.rst +doc/build/html/index.html +*flymake* +scikits +.coverage +pandas.egg-info +*\#*\# +.tox diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..0c037b5b --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +language: python + +python: + - 2.5 + - 2.6 + - 2.7 + - 3.1 + - 3.2 + +install: + - "if [[ $TRAVIS_PYTHON_VERSION == '2.5' ]]; then pip install --use-mirrors simplejson; fi" + - pip install --use-mirrors cython numpy nose pytz + +script: + - python setup.py build_ext install + - nosetests --exe -w /tmp pandas.tests diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f0694db1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,80 @@ +====================== +PANDAS LICENSING TERMS +====================== + +pandas is licensed under the BSD 3-Clause (also known as "BSD New" or +"BSD Simplified"), as follows: + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +About the Copyright Holders +=========================== + +AQR Capital Management began pandas development in 2008. Development was +led by Wes McKinney. AQR released the source under this license in 2009. +Wes is now an employee of Lambda Foundry, and remains the pandas project +lead. + +The PyData Development Team is the collection of developers of the PyData +project. This includes all of the PyData sub-projects, including pandas. The +core team that coordinates development on GitHub can be found here: +http://github.com/pydata. + +Full credits for pandas contributors can be found in the documentation. + +Our Copyright Policy +==================== + +PyData uses a shared copyright model. Each contributor maintains copyright +over their contributions to PyData. However, it is important to note that +these contributions are typically only changes to the repositories. Thus, +the PyData source code, in its entirety, is not the copyright of any single +person or institution. Instead, it is the collective copyright of the +entire PyData Development Team. If individual contributors want to maintain +a record of what changes/contributions they have specific copyright on, +they should indicate their copyright in the commit message of the change +when they commit the change to one of the PyData repositories. + +With this in mind, the following banner should be used in any source code +file to indicate the copyright and license terms: + +#----------------------------------------------------------------------------- +# Copyright (c) 2012, PyData Development Team +# All rights reserved. +# +# Distributed under the terms of the BSD Simplified License. +# +# The full license is in the LICENSE file, distributed with this software. 
+#----------------------------------------------------------------------------- \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..8e157bfe --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,25 @@ +include MANIFEST.in +include LICENSE +include RELEASE.rst +include README.rst +include TODO.rst +include setup.py +include setupegg.py + +graft doc +prune doc/build + +graft examples +graft pandas + +global-exclude *.so +global-exclude *.pyd +global-exclude *.pyc +global-exclude .git* +global-exclude .DS_Store +global-exclude *.png + +# include examples/data/* +# recursive-include examples *.py +# recursive-include doc/source * +# recursive-include doc/sphinxext * diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..a4861c14 --- /dev/null +++ b/Makefile @@ -0,0 +1,11 @@ +clean: + -rm -rf build dist + +tseries: pandas/src/tseries.pyx + python setup.py build_ext --inplace + +sparse: pandas/src/sparse.pyx + -python setup.py build_ext --inplace + +test: sparse + -python pandas/tests/test_libsparse.py \ No newline at end of file diff --git a/NP_LICENSE.txt b/NP_LICENSE.txt new file mode 100644 index 00000000..7e972cff --- /dev/null +++ b/NP_LICENSE.txt @@ -0,0 +1,30 @@ +Copyright (c) 2005-2011, NumPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.rst b/README.rst new file mode 100644 index 00000000..d7a52a6c --- /dev/null +++ b/README.rst @@ -0,0 +1,125 @@ +============================================= +pandas: powerful Python data analysis toolkit +============================================= + +What is it +========== + +**pandas** is a Python package providing fast, flexible, and expressive data +structures designed to make working with "relational" or "labeled" data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, **real world** data analysis in Python. Additionally, it has +the broader goal of becoming **the most powerful and flexible open source data +analysis / manipulation tool available in any language**. 
It is already well on +its way toward this goal. + +Main Features +============= + +Here are just a few of the things that pandas does well: + + - Easy handling of **missing data** (represented as NaN) in floating point as + well as non-floating point data + - Size mutability: columns can be **inserted and deleted** from DataFrame and + higher dimensional objects + - Automatic and explicit **data alignment**: objects can be explicitly + aligned to a set of labels, or the user can simply ignore the labels and + let `Series`, `DataFrame`, etc. automatically align the data for you in + computations + - Powerful, flexible **group by** functionality to perform + split-apply-combine operations on data sets, for both aggregating and + transforming data + - Make it **easy to convert** ragged, differently-indexed data in other + Python and NumPy data structures into DataFrame objects + - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets + - Intuitive **merging** and **joining** data sets + - Flexible **reshaping** and pivoting of data sets + - **Hierarchical** labeling of axes (possible to have multiple labels per + tick) + - Robust IO tools for loading data from **flat files** (CSV and delimited), + Excel files, databases, and saving / loading data from the ultrafast **HDF5 + format** + - **Time series**-specific functionality: date range generation and frequency + conversion, moving window statistics, moving window linear regressions, + date shifting and lagging, etc. + +Where to get it +=============== + +The source code is currently hosted on GitHub at: http://github.com/pydata/pandas + +Binary installers for the latest released version are available at the Python +package index:: + + http://pypi.python.org/pypi/pandas/ + +And via ``easy_install`` or ``pip``:: + + easy_install pandas + pip install pandas + +Dependencies +============ + + * `NumPy `__: 1.6.1 or higher + * `python-dateutil `__ 1.5 + +Optional dependencies +~~~~~~~~~~~~~~~~~~~~~ + + * `Cython `__: Only necessary to build development + version + * `SciPy `__: miscellaneous statistical functions + * `PyTables `__: necessary for HDF5-based storage + * `matplotlib `__: for plotting + * `scikits.statsmodels `__ + * Needed for parts of :mod:`pandas.stats` + * `pytz `__ + * Needed for time zone support with ``DateRange`` + +Installation from sources +========================= + +In the ``pandas`` directory (same one where you found this file), execute:: + + python setup.py install + +On Windows, you will need to install MinGW and execute:: + + python setup.py build --compiler=mingw32 + python setup.py install + +See http://pandas.pydata.org/ for more information. + +License +======= + +BSD + +Documentation +============= + +The official documentation is hosted on PyData.org: http://pandas.pydata.org/ + +The Sphinx documentation should provide a good starting point for learning how +to use the library. Expect the docs to continue to expand as time goes on. + +Background +========== + +Work on ``pandas`` started at AQR (a quantitative hedge fund) in 2008 and +has been under active development since then. + +Discussion and Development +========================== + +Since ``pandas`` development is related to a number of other scientific +Python projects, questions are welcome on the scipy-user mailing +list. 
Specialized discussions or design issues should take place on +the pystatsmodels mailing list / Google group, where +``scikits.statsmodels`` and other libraries will also be discussed: + +http://groups.google.com/group/pystatsmodels + + .. _NumPy: http://numpy.scipy.org/ diff --git a/RELEASE.rst b/RELEASE.rst new file mode 100644 index 00000000..25ebda23 --- /dev/null +++ b/RELEASE.rst @@ -0,0 +1,1752 @@ +============= +Release Notes +============= + +This is the list of changes to pandas between each release. For full details, +see the commit logs at http://github.com/pydata/pandas + +What is it +---------- + +pandas is a Python package providing fast, flexible, and expressive data +structures designed to make working with “relational” or “labeled” data both +easy and intuitive. It aims to be the fundamental high-level building block for +doing practical, real world data analysis in Python. Additionally, it has the +broader goal of becoming the most powerful and flexible open source data +analysis / manipulation tool available in any language. + +Where to get it +--------------- + +* Source code: http://github.com/pydata/pandas +* Binary installers on PyPI: http://pypi.python.org/pypi/pandas +* Documentation: http://pandas.pydata.org + +pandas 0.8.0 +============ + +**Release date:** 6/29/2012 + +**New features** + + - New unified DatetimeIndex class for nanosecond-level timestamp data + - New Timestamp datetime.datetime subclass with easy time zone conversions, + and support for nanoseconds + - New PeriodIndex class for timespans, calendar logic, and Period scalar object + - High performance resampling of timestamp and period data. New `resample` + method of all pandas data structures + - New frequency names plus shortcut string aliases like '15h', '1h30min' + - Time series string indexing shorthand (#222) + - Add week, dayofyear array and other timestamp array-valued field accessor + functions to DatetimeIndex + - Add GroupBy.prod optimized aggregation function and 'prod' fast time series + conversion method (#1018) + - Implement robust frequency inference function and `inferred_freq` attribute + on DatetimeIndex (#391) + - New ``tz_convert`` and ``tz_localize`` methods in Series / DataFrame + - Convert DatetimeIndexes to UTC if time zones are different in join/setops + (#864) + - Add limit argument for forward/backward filling to reindex, fillna, + etc. (#825 and others) + - Add support for indexes (dates or otherwise) with duplicates and common + sense indexing/selection functionality + - Series/DataFrame.update methods, in-place variant of combine_first (#961) + - Add ``match`` function to API (#502) + - Add Cython-optimized first, last, min, max, prod functions to GroupBy (#994, + #1043) + - Dates can be split across multiple columns (#1227, #1186) + - Add experimental support for converting pandas DataFrame to R data.frame + via rpy2 (#350, #1212) + - Can pass list of (name, function) to GroupBy.aggregate to get aggregates in + a particular order (#610) + - Can pass dicts with lists of functions or dicts to GroupBy aggregate to do + much more flexible multiple function aggregation (#642, #610) + - New ordered_merge functions for merging DataFrames with ordered + data. 
Also supports group-wise merging for panel data (#813) + - Add keys() method to DataFrame + - Add flexible replace method for replacing values in Series and + DataFrame (#929, #1241) + - Add 'kde' plot kind for Series/DataFrame.plot (#1059) + - More flexible multiple function aggregation with GroupBy + - Add pct_change function to Series/DataFrame + - Add option to interpolate by Index values in Series.interpolate (#1206) + - Add ``max_colwidth`` option for DataFrame, defaulting to 50 + - Conversion of DataFrame through rpy2 to R data.frame (#1282) + - Add keys() method on DataFrame (#1240) + - Add new ``match`` function to API (similar to R) (#502) + - Add dayfirst option to parsers (#854) + - Add ``method`` argument to ``align`` method for forward/backward filling + (#216) + - Add Panel.transpose method for rearranging axes (#695) + - Add new ``cut`` function (patterned after R) for discretizing data into + equal range-length bins or arbitrary breaks of your choosing (#415) + - Add new ``qcut`` for cutting with quantiles (#1378) + - Add ``value_counts`` top level array method (#1392) + - Added Andrews curves plot type (#1325) + - Add lag plot (#1440) + - Add autocorrelation_plot (#1425) + - Add support for tox and Travis CI (#1382) + - Add support for Categorical use in GroupBy (#292) + - Add ``any`` and ``all`` methods to DataFrame (#1416) + - Add ``secondary_y`` option to Series.plot + - Add experimental ``lreshape`` function for reshaping wide to long + +**Improvements to existing features** + + - Switch to klib/khash-based hash tables in Index classes for better + performance in many cases and lower memory footprint + - Shipping some functions from scipy.stats to reduce dependency, + e.g. Series.describe and DataFrame.describe (GH #1092) + - Can create MultiIndex by passing list of lists or list of arrays to Series, + DataFrame constructor, etc. (#831) + - Can pass arrays in addition to column names to DataFrame.set_index (#402) + - Improve the speed of "square" reindexing of homogeneous DataFrame objects + by a significant margin (#836) + - Handle more dtypes when passed MaskedArrays in DataFrame constructor (#406) + - Improved performance of join operations on integer keys (#682) + - Can pass multiple columns to GroupBy object, e.g.
grouped[[col1, col2]] to + only aggregate a subset of the value columns (#383) + - Add histogram / kde plot options for scatter_matrix diagonals (#1237) + - Add inplace option to Series/DataFrame.rename and sort_index, + DataFrame.drop_duplicates (#805, #207) + - More helpful error message when nothing passed to Series.reindex (#1267) + - Can mix array and scalars as dict-value inputs to DataFrame ctor (#1329) + - Use DataFrame columns' name for legend title in plots + - Preserve frequency in DatetimeIndex when possible in boolean indexing + operations + - Promote datetime.date values in data alignment operations (#867) + - Add ``order`` method to Index classes (#1028) + - Avoid hash table creation in large monotonic hash table indexes (#1160) + - Store time zones in HDFStore (#1232) + - Enable storage of sparse data structures in HDFStore (#85) + - Enable Series.asof to work with arrays of timestamp inputs + - Cython implementation of DataFrame.corr speeds up by > 100x (#1349, #1354) + - Exclude "nuisance" columns automatically in GroupBy.transform (#1364) + - Support functions-as-strings in GroupBy.transform (#1362) + - Use index name as xlabel/ylabel in plots (#1415) + - Add ``convert_dtype`` option to Series.apply to be able to leave data as + dtype=object (#1414) + - Can specify all index level names in concat (#1419) + - Add ``dialect`` keyword to parsers for quoting conventions (#1363) + - Enable DataFrame[bool_DataFrame] += value (#1366) + - Add ``retries`` argument to ``get_data_yahoo`` to try to prevent Yahoo! API + 404s (#826) + - Improve performance of reshaping by using O(N) categorical sorting + - Series names will be used for index of DataFrame if no index passed (#1494) + - Header argument in DataFrame.to_csv can accept a list of column names to + use instead of the object's columns (#921) + - Add ``raise_conflict`` argument to DataFrame.update (#1526) + - Support file-like objects in ExcelFile (#1529) + +**API Changes** + + - Rename Factor to Categorical and add improvements. Numerous Categorical bug + fixes + - Frequency name overhaul, WEEKDAY/EOM and rules with @ + deprecated. get_legacy_offset_name backwards compatibility function added + - Raise ValueError in DataFrame.__nonzero__, so "if df" no longer works + (#1073) + - Change BDay (business day) to not normalize dates by default (#506) + - Remove deprecated DataMatrix name + - Default merge suffixes for overlap now have underscores instead of periods + to facilitate tab completion, etc. 
(#1239) + - Deprecation of offset, time_rule, and timeRule parameters throughout codebase + - Series.append and DataFrame.append no longer check for duplicate indexes + by default, add verify_integrity parameter (#1394) + - Refactor Factor class, old constructor moved to Factor.from_array + - Modified internals of MultiIndex to use less memory (no longer represented + as array of tuples) internally, speed up construction time and many methods + which construct intermediate hierarchical indexes (#1467) + +**Bug fixes** + + - Fix OverflowError from storing pre-1970 dates in HDFStore by switching to + datetime64 (GH #179) + - Fix logical error with February leap year end in YearEnd offset + - Series([False, nan]) was getting cast to float64 (GH #1074) + - Fix binary operations between boolean Series and object Series with + booleans and NAs (GH #1074, #1079) + - Couldn't assign whole array to column in mixed-type DataFrame via .ix + (#1142) + - Fix label slicing issues with float index values (#1167) + - Fix segfault caused by empty groups passed to groupby (#1048) + - Fix occasionally misbehaved reindexing in the presence of NaN labels (#522) + - Fix imprecise logic causing weird Series results from .apply (#1183) + - Unstack multiple levels in one shot, avoiding empty columns in some + cases. Fix pivot table bug (#1181) + - Fix formatting of MultiIndex on Series/DataFrame when index name coincides + with label (#1217) + - Handle Excel 2003 #N/A as NaN from xlrd (#1213, #1225) + - Fix timestamp locale-related deserialization issues with HDFStore by moving + to datetime64 representation (#1081, #809) + - Fix DataFrame.duplicated/drop_duplicates NA value handling (#557) + - Actually raise exceptions in fast reducer (#1243) + - Fix various timezone-handling bugs from 0.7.3 (#969) + - GroupBy on level=0 discarded index name (#1313) + - Better error message with unmergeable DataFrames (#1307) + - Series.__repr__ alignment fix with unicode index values (#1279) + - Better error message if nothing passed to reindex (#1267) + - More robust NA handling in DataFrame.drop_duplicates (#557) + - Resolve locale-based and pre-epoch HDF5 timestamp deserialization issues + (#973, #1081, #179) + - Implement Series.repeat (#1229) + - Fix indexing with namedtuple and other tuple subclasses (#1026) + - Fix float64 slicing bug (#1167) + - Parsing integers with commas (#796) + - Fix groupby improper data type when group consists of one value (#1065) + - Fix negative variance possibility in nanvar resulting from floating point + error (#1090) + - Consistently set name on groupby pieces (#184) + - Treat dict return values as Series in GroupBy.apply (#823) + - Respect column selection for DataFrame in GroupBy.transform (#1365) + - Fix MultiIndex partial indexing bug (#1352) + - Enable assignment of rows in mixed-type DataFrame via .ix (#1432) + - Reset index mapping when grouping Series in Cython (#1423) + - Fix outer/inner DataFrame.join with non-unique indexes (#1421) + - Fix MultiIndex groupby bugs with empty lower levels (#1401) + - Calling fillna with a Series will have same behavior as with dict (#1486) + - SparseSeries reduction bug (#1375) + - Fix unicode serialization issue in HDFStore (#1361) + - Pass keywords to pyplot.boxplot in DataFrame.boxplot (#1493) + - Bug fixes in MonthBegin (#1483) + - Preserve MultiIndex names in drop (#1513) + - Fix Panel DataFrame slice-assignment bug (#1533) + - Don't use locals() in read_* functions (#1547) + +pandas 0.7.3 +============ + +**Release date:** April 12, 2012 +
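Illustrative sketch (not part of the imported tarball): a minimal example of a few of the pandas 0.8.0 additions listed in the release notes above -- ``date_range``/DatetimeIndex, ``resample``, ``cut``/``qcut`` and ``value_counts`` -- written against a recent pandas with toy data, so spellings may differ slightly from the 0.8.x API (which used e.g. ``ts.resample('M', how='mean')``)::

    import numpy as np
    import pandas as pd

    # DatetimeIndex via date_range, using string frequency aliases ('D', '15min', ...).
    idx = pd.date_range('2012-01-01', periods=120, freq='D')
    ts = pd.Series(np.random.randn(120), index=idx)

    # Resampling: downsample the daily series to month-end means.
    monthly = ts.resample('M').mean()

    # cut / qcut: equal-width bins vs. quantile-based bins.
    width_bins = pd.cut(ts, 4)
    quartiles = pd.qcut(ts, 4)

    # value_counts on the binned data; pct_change on the raw series.
    print(width_bins.value_counts())
    print(ts.pct_change().head())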
+**New features / modules** + + - Support for non-unique indexes: indexing and selection, many-to-one and + many-to-many joins (#1306) + - Added fixed-width file reader, read_fwf (PR #952) + - Add group_keys argument to groupby to not add group names to MultiIndex in + result of apply (GH #938) + - DataFrame can now accept non-integer label slicing (GH #946). Previously + only DataFrame.ix was able to do so. + - DataFrame.apply now retains name attributes on Series objects (GH #983) + - Numeric DataFrame comparisons with non-numeric values now raise a proper + TypeError (GH #943). Previously raised "PandasError: DataFrame constructor + not properly called!" + - Add ``kurt`` methods to Series and DataFrame (PR #964) + - Can pass dict of column -> list/set NA values for text parsers (GH #754) + - Allow user-specified NA values in text parsers (GH #754) + - Parser checks for openpyxl dependency and raises ImportError if not found + (PR #1007) + - New factory function to create HDFStore objects that can be used in a with + statement so users do not have to explicitly call HDFStore.close (PR #1005) + - pivot_table is now more flexible with same parameters as groupby (GH #941) + - Added stacked bar plots (GH #987) + - scatter_matrix method in pandas/tools/plotting.py (PR #935) + - DataFrame.boxplot returns plot results for ex-post styling (GH #985) + - Short version number accessible as pandas.version.short_version (GH #930) + - Additional documentation in panel.to_frame (GH #942) + - More informative Series.apply docstring regarding element-wise apply + (GH #977) + - Notes on rpy2 installation (GH #1006) + - Add rotation and font size options to hist method (#1012) + - Use exogenous / X variable index in result of OLS.y_predict. Add + OLS.predict method (PR #1027, #1008) + +**API Changes** + + - Calling apply on grouped Series, e.g. describe(), will no longer yield + DataFrame by default. Will have to call unstack() to get prior behavior + - NA handling in non-numeric comparisons has been tightened up (#933, #953) + - No longer assign dummy names key_0, key_1, etc. to groupby index (#1291) + +**Bug fixes** + + - Fix logic error when selecting part of a row in a DataFrame with a + MultiIndex index (GH #1013) + - Series comparison with Series of differing length causes crash (GH #1016). + - Fix bug in indexing when selecting section of hierarchically-indexed row + (GH #1013) + - DataFrame.plot(logy=True) has no effect (GH #1011).
+ - Broken arithmetic operations between SparsePanel-Panel (GH #1015) + - Unicode repr issues in MultiIndex with non-ascii characters (GH #1010) + - DataFrame.lookup() returns inconsistent results if exact match not present + (GH #1001) + - DataFrame arithmetic operations not treating None as NA (GH #992) + - DataFrameGroupBy.apply returns incorrect result (GH #991) + - Series.reshape returns incorrect result for multiple dimensions (GH #989) + - Series.std and Series.var ignore ddof parameter (GH #934) + - DataFrame.append loses index names (GH #980) + - DataFrame.plot(kind='bar') ignores color argument (GH #958) + - Inconsistent Index comparison results (GH #948) + - Improper int dtype DataFrame construction from data with NaN (GH #846) + - Removes default 'result' name in groupby results (GH #995) + - DataFrame.from_records no longer mutates input columns (PR #975) + - Use Index name when grouping by it (#1313) + +pandas 0.7.2 +============ + +**Release date:** March 16, 2012 + +**New features / modules** + + - Add additional tie-breaking methods in DataFrame.rank (#874) + - Add ascending parameter to rank in Series, DataFrame (#875) + - Add coerce_float option to DataFrame.from_records (#893) + - Add sort_columns parameter to allow unsorted plots (#918) + - IPython tab completion on GroupBy objects + +**API Changes** + + - Series.sum returns 0 instead of NA when called on an empty + series. Analogously for a DataFrame whose rows or columns are length 0 + (#844) + +**Improvements to existing features** + + - Don't use groups dict in Grouper.size (#860) + - Use khash for Series.value_counts, add raw function to algorithms.py (#861) + - Enable column access via attributes on GroupBy (#882) + - Enable setting existing columns (only) via attributes on DataFrame, Panel + (#883) + - Intercept __builtin__.sum in groupby (#885) + - Can pass dict to DataFrame.fillna to use different values per column (#661) + - Can select multiple hierarchical groups by passing list of values in .ix + (#134) + - Add level keyword to ``drop`` for dropping values from a level (GH #159) + - Add ``coerce_float`` option on DataFrame.from_records (#893) + - Raise exception if passed date_parser fails in ``read_csv`` + - Add ``axis`` option to DataFrame.fillna (#174) + - Fixes to Panel to make it easier to subclass (PR #888) + +**Bug fixes** + + - Fix overflow-related bugs in groupby (#850, #851) + - Fix unhelpful error message in parsers (#856) + - Better err msg for failed boolean slicing of dataframe (#859) + - Series.count cannot accept a string (level name) in the level argument (#869) + - Group index platform int check (#870) + - concat on axis=1 and ignore_index=True raises TypeError (#871) + - Further unicode handling issues resolved (#795) + - Fix failure in multiindex-based access in Panel (#880) + - Fix DataFrame boolean slice assignment failure (#881) + - Fix combineAdd NotImplementedError for SparseDataFrame (#887) + - Fix DataFrame.to_html encoding and columns (#890, #891, #909) + - Fix na-filling handling in mixed-type DataFrame (#910) + - Fix to DataFrame.set_value with non-existent row/col (#911) + - Fix malformed block in groupby when excluding nuisance columns (#916) + - Fix inconsistent NA handling in dtype=object arrays (#925) + - Fix missing center-of-mass computation in ewmcov (#862) + - Don't raise exception when opening read-only HDF5 file (#847) + - Fix possible out-of-bounds memory access in 0-length Series (#917) + +pandas 0.7.1 +============ + +**Release date:** February 29, 2012 + +**New
features / modules** + + - Add ``to_clipboard`` function to pandas namespace for writing objects to + the system clipboard (#774) + - Add ``itertuples`` method to DataFrame for iterating through the rows of a + dataframe as tuples (#818) + - Add ability to pass fill_value and method to DataFrame and Series align + method (#806, #807) + - Add fill_value option to reindex, align methods (#784) + - Enable concat to produce DataFrame from Series (#787) + - Add ``between`` method to Series (#802) + - Add HTML representation hook to DataFrame for the IPython HTML notebook + (#773) + - Support for reading Excel 2007 XML documents using openpyxl + +**Improvements to existing features** + + - Improve performance and memory usage of fillna on DataFrame + - Can concatenate a list of Series along axis=1 to obtain a DataFrame (#787) + +**Bug fixes** + + - Fix memory leak when inserting large number of columns into a single + DataFrame (#790) + - Appending length-0 DataFrame with new columns would not result in those new + columns being part of the resulting concatenated DataFrame (#782) + - Fixed groupby corner case when passing dictionary grouper and as_index is + False (#819) + - Fixed bug whereby bool array sometimes had object dtype (#820) + - Fix exception thrown on np.diff (#816) + - Fix to_records where columns are non-strings (#822) + - Fix Index.intersection where indices have incomparable types (#811) + - Fix ExcelFile throwing an exception for two-line file (#837) + - Add clearer error message in csv parser (#835) + - Fix loss of fractional seconds in HDFStore (#513) + - Fix DataFrame join where columns have datetimes (#787) + - Work around numpy performance issue in take (#817) + - Improve comparison operations for NA-friendliness (#801) + - Fix indexing operation for floating point values (#780, #798) + - Fix groupby case resulting in malformed dataframe (#814) + - Fix behavior of reindex of Series dropping name (#812) + - Improve on redudant groupby computation (#775) + - Catch possible NA assignment to int/bool series with exception (#839) + +pandas 0.7.0 +============ + +**Release date:** 2/9/2012 + +**New features / modules** + + - New ``merge`` function for efficiently performing full gamut of database / + relational-algebra operations. Refactored existing join methods to use the + new infrastructure, resulting in substantial performance gains (GH #220, + #249, #267) + - New ``concat`` function for concatenating DataFrame or Panel objects along + an axis. Can form union or intersection of the other axes. 
Improves + performance of ``DataFrame.append`` (#468, #479, #273) + - Handle differently-indexed output values in ``DataFrame.apply`` (GH #498) + - Can pass list of dicts (e.g., a list of shallow JSON objects) to DataFrame + constructor (GH #526) + - Add ``reorder_levels`` method to Series and DataFrame (PR #534) + - Add dict-like ``get`` function to DataFrame and Panel (PR #521) + - ``DataFrame.iterrows`` method for efficiently iterating through the rows of + a DataFrame + - Added ``DataFrame.to_panel`` with code adapted from ``LongPanel.to_long`` + - ``reindex_axis`` method added to DataFrame + - Add ``level`` option to binary arithmetic functions on ``DataFrame`` and + ``Series`` + - Add ``level`` option to the ``reindex`` and ``align`` methods on Series and + DataFrame for broadcasting values across a level (GH #542, PR #552, others) + - Add attribute-based item access to ``Panel`` and add IPython completion (PR + #554) + - Add ``logy`` option to ``Series.plot`` for log-scaling on the Y axis + - Add ``index``, ``header``, and ``justify`` options to + ``DataFrame.to_string``. Add option to (GH #570, GH #571) + - Can pass multiple DataFrames to ``DataFrame.join`` to join on index (GH #115) + - Can pass multiple Panels to ``Panel.join`` (GH #115) + - Can pass multiple DataFrames to `DataFrame.append` to concatenate (stack) + and multiple Series to ``Series.append`` too + - Added ``justify`` argument to ``DataFrame.to_string`` to allow different + alignment of column headers + - Add ``sort`` option to GroupBy to allow disabling sorting of the group keys + for potential speedups (GH #595) + - Can pass MaskedArray to Series constructor (PR #563) + - Add Panel item access via attributes and IPython completion (GH #554) + - Implement ``DataFrame.lookup``, fancy-indexing analogue for retrieving + values given a sequence of row and column labels (GH #338) + - Add ``verbose`` option to ``read_csv`` and ``read_table`` to show number of + NA values inserted in non-numeric columns (GH #614) + - Can pass a list of dicts or Series to ``DataFrame.append`` to concatenate + multiple rows (GH #464) + - Add ``level`` argument to ``DataFrame.xs`` for selecting data from other + MultiIndex levels. 
Can take one or more levels with potentially a tuple of + keys for flexible retrieval of data (GH #371, GH #629) + - New ``crosstab`` function for easily computing frequency tables (GH #170) + - Can pass a list of functions to aggregate with groupby on a DataFrame, + yielding an aggregated result with hierarchical columns (GH #166) + - Add integer-indexing functions ``iget`` in Series and ``irow`` / ``iget`` + in DataFrame (GH #628) + - Add new ``Series.unique`` function, significantly faster than + ``numpy.unique`` (GH #658) + - Add new ``cummin`` and ``cummax`` instance methods to ``Series`` and + ``DataFrame`` (GH #647) + - Add new ``value_range`` function to return min/max of a dataframe (GH #288) + - Add ``drop`` parameter to ``reset_index`` method of ``DataFrame`` and added + method to ``Series`` as well (GH #699) + - Add ``isin`` method to Index objects, works just like ``Series.isin`` (GH + #657) + - Implement array interface on Panel so that ufuncs work (re: #740) + - Add ``sort`` option to ``DataFrame.join`` (GH #731) + - Improved handling of NAs (propagation) in binary operations with + dtype=object arrays (GH #737) + - Add ``abs`` method to Pandas objects + - Added ``algorithms`` module to start collecting central algos + +**API Changes** + + - Label-indexing with integer indexes now raises KeyError if a label is not + found instead of falling back on location-based indexing (GH #700) + - Label-based slicing via ``ix`` or ``[]`` on Series will now only work if + exact matches for the labels are found or if the index is monotonic (for + range selections) + - Label-based slicing and sequences of labels can be passed to ``[]`` on a + Series for both getting and setting (GH #86) + - `[]` operator (``__getitem__`` and ``__setitem__``) will raise KeyError + with integer indexes when an index is not contained in the index. The prior + behavior would fall back on position-based indexing if a key was not found + in the index which would lead to subtle bugs. This is now consistent with + the behavior of ``.ix`` on DataFrame and friends (GH #328) + - Rename ``DataFrame.delevel`` to ``DataFrame.reset_index`` and add + deprecation warning + - `Series.sort` (an in-place operation) called on a Series which is a view on + a larger array (e.g. 
a column in a DataFrame) will generate an Exception to + prevent accidentally modifying the data source (GH #316) + - Refactor to remove deprecated ``LongPanel`` class (PR #552) + - Deprecated ``Panel.to_long``, renamed to ``to_frame`` + - Deprecated ``colSpace`` argument in ``DataFrame.to_string``, renamed to + ``col_space`` + - Rename ``precision`` to ``accuracy`` in engineering float formatter (GH + #395) + - The default delimiter for ``read_csv`` is comma rather than letting + ``csv.Sniffer`` infer it + - Rename ``col_or_columns`` argument in ``DataFrame.drop_duplicates`` (GH + #734) + +**Improvements to existing features** + + - Better error message in DataFrame constructor when passed column labels + don't match data (GH #497) + - Substantially improve performance of multi-GroupBy aggregation when a + Python function is passed, reuse ndarray object in Cython (GH #496) + - Can store objects indexed by tuples and floats in HDFStore (GH #492) + - Don't print length by default in Series.to_string, add `length` option (GH + #489) + - Improve Cython code for multi-groupby to aggregate without having to sort + the data (GH #93) + - Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, + test for backwards unpickling compatibility + - Improve column reindexing performance by using specialized Cython take + function + - Further performance tweaking of Series.__getitem__ for standard use cases + - Avoid Index dict creation in some cases (i.e. when getting slices, etc.), + regression from prior versions + - Friendlier error message in setup.py if NumPy not installed + - Use common set of NA-handling operations (sum, mean, etc.) in Panel class + also (GH #536) + - Default name assignment when calling ``reset_index`` on DataFrame with a + regular (non-hierarchical) index (GH #476) + - Use Cythonized groupers when possible in Series/DataFrame stat ops with + ``level`` parameter passed (GH #545) + - Ported skiplist data structure to C to speed up ``rolling_median`` by about + 5-10x in most typical use cases (GH #374) + - Some performance enhancements in constructing a Panel from a dict of + DataFrame objects + - Made ``Index._get_duplicates`` a public method by removing the underscore + - Prettier printing of floats, and column spacing fix (GH #395, GH #571) + - Add ``bold_rows`` option to DataFrame.to_html (GH #586) + - Improve the performance of ``DataFrame.sort_index`` by up to 5x or more + when sorting by multiple columns + - Substantially improve performance of DataFrame and Series constructors when + passed a nested dict or dict, respectively (GH #540, GH #621) + - Modified setup.py so that pip / setuptools will install dependencies (GH + #507, various pull requests) + - Unstack called on DataFrame with non-MultiIndex will return Series (GH + #477) + - Improve DataFrame.to_string and console formatting to be more consistent in + the number of displayed digits (GH #395) + - Use bottleneck if available for performing NaN-friendly statistical + operations that it implemented (GH #91) + - Monkey-patch context to traceback in ``DataFrame.apply`` to indicate which + row/column the function application failed on (GH #614) + - Improved ability of read_table and read_clipboard to parse + console-formatted DataFrames (can read the row of index names, etc.) 
+ - Can pass list of group labels (without having to convert to an ndarray + yourself) to ``groupby`` in some cases (GH #659) + - Use ``kind`` argument to Series.order for selecting different sort kinds + (GH #668) + - Add option to Series.to_csv to omit the index (PR #684) + - Add ``delimiter`` as an alternative to ``sep`` in ``read_csv`` and other + parsing functions + - Substantially improved performance of groupby on DataFrames with many + columns by aggregating blocks of columns all at once (GH #745) + - Can pass a file handle or StringIO to Series/DataFrame.to_csv (GH #765) + - Can pass sequence of integers to DataFrame.irow(icol) and Series.iget, (GH + #654) + - Prototypes for some vectorized string functions + - Add float64 hash table to solve the Series.unique problem with NAs (GH #714) + - Memoize objects when reading from file to reduce memory footprint + - Can get and set a column of a DataFrame with hierarchical columns + containing "empty" ('') lower levels without passing the empty levels (PR + #768) + +**Bug fixes** + + - Raise exception in out-of-bounds indexing of Series instead of + seg-faulting, regression from earlier releases (GH #495) + - Fix error when joining DataFrames of different dtypes within the same + typeclass (e.g. float32 and float64) (GH #486) + - Fix bug in Series.min/Series.max on objects like datetime.datetime (GH + #487) + - Preserve index names in Index.union (GH #501) + - Fix bug in Index joining causing subclass information (like DateRange type) + to be lost in some cases (GH #500) + - Accept empty list as input to DataFrame constructor, regression from 0.6.0 + (GH #491) + - Can output DataFrame and Series with ndarray objects in a dtype=object + array (GH #490) + - Return empty string from Series.to_string when called on empty Series (GH + #488) + - Fix exception passing empty list to DataFrame.from_records + - Fix Index.format bug (excluding name field) with datetimes with time info + - Fix scalar value access in Series to always return NumPy scalars, + regression from prior versions (GH #510) + - Handle rows skipped at beginning of file in read_* functions (GH #505) + - Handle improper dtype casting in ``set_value`` methods + - Unary '-' / __neg__ operator on DataFrame was returning integer values + - Unbox 0-dim ndarrays from certain operators like all, any in Series + - Fix handling of missing columns (was combine_first-specific) in + DataFrame.combine for general case (GH #529) + - Fix type inference logic with boolean lists and arrays in DataFrame indexing + - Use centered sum of squares in R-square computation if entity_effects=True + in panel regression + - Handle all NA case in Series.{corr, cov}, was raising exception (GH #548) + - Aggregating by multiple levels with ``level`` argument to DataFrame, Series + stat method, was broken (GH #545) + - Fix Cython buf when converter passed to read_csv produced a numeric array + (buffer dtype mismatch when passed to Cython type inference function) (GH + #546) + - Fix exception when setting scalar value using .ix on a DataFrame with a + MultiIndex (GH #551) + - Fix outer join between two DateRanges with different offsets that returned + an invalid DateRange + - Cleanup DataFrame.from_records failure where index argument is an integer + - Fix Data.from_records failure when passed a dictionary + - Fix NA handling in {Series, DataFrame}.rank with non-floating point dtypes + - Fix bug related to integer type-checking in .ix-based indexing + - Handle non-string index name passed to 
DataFrame.from_records + - DataFrame.insert caused the columns name(s) field to be discarded (GH #527) + - Fix erroneous in monotonic many-to-one left joins + - Fix DataFrame.to_string to remove extra column white space (GH #571) + - Format floats to default to same number of digits (GH #395) + - Added decorator to copy docstring from one function to another (GH #449) + - Fix error in monotonic many-to-one left joins + - Fix __eq__ comparison between DateOffsets with different relativedelta + keywords passed + - Fix exception caused by parser converter returning strings (GH #583) + - Fix MultiIndex formatting bug with integer names (GH #601) + - Fix bug in handling of non-numeric aggregates in Series.groupby (GH #612) + - Fix TypeError with tuple subclasses (e.g. namedtuple) in + DataFrame.from_records (GH #611) + - Catch misreported console size when running IPython within Emacs + - Fix minor bug in pivot table margins, loss of index names and length-1 + 'All' tuple in row labels + - Add support for legacy WidePanel objects to be read from HDFStore + - Fix out-of-bounds segfault in pad_object and backfill_object methods when + either source or target array are empty + - Could not create a new column in a DataFrame from a list of tuples + - Fix bugs preventing SparseDataFrame and SparseSeries working with groupby + (GH #666) + - Use sort kind in Series.sort / argsort (GH #668) + - Fix DataFrame operations on non-scalar, non-pandas objects (GH #672) + - Don't convert DataFrame column to integer type when passing integer to + __setitem__ (GH #669) + - Fix downstream bug in pivot_table caused by integer level names in + MultiIndex (GH #678) + - Fix SparseSeries.combine_first when passed a dense Series (GH #687) + - Fix performance regression in HDFStore loading when DataFrame or Panel + stored in table format with datetimes + - Raise Exception in DateRange when offset with n=0 is passed (GH #683) + - Fix get/set inconsistency with .ix property and integer location but + non-integer index (GH #707) + - Use right dropna function for SparseSeries. Return dense Series for NA fill + value (GH #730) + - Fix Index.format bug causing incorrectly string-formatted Series with + datetime indexes (# 726, 758) + - Fix errors caused by object dtype arrays passed to ols (GH #759) + - Fix error where column names lost when passing list of labels to + DataFrame.__getitem__, (GH #662) + - Fix error whereby top-level week iterator overwrote week instance + - Fix circular reference causing memory leak in sparse array / series / + frame, (GH #663) + - Fix integer-slicing from integers-as-floats (GH #670) + - Fix zero division errors in nanops from object dtype arrays in all NA case + (GH #676) + - Fix csv encoding when using unicode (GH #705, #717, #738) + - Fix assumption that each object contains every unique block type in concat, + (GH #708) + - Fix sortedness check of multiindex in to_panel (GH #719, 720) + - Fix that None was not treated as NA in PyObjectHashtable + - Fix hashing dtype because of endianness confusion (GH #747, #748) + - Fix SparseSeries.dropna to return dense Series in case of NA fill value (GH + #730) + - Use map_infer instead of np.vectorize. 
handle NA sentinels if converter + yields numeric array, (GH #753) + - Fixes and improvements to DataFrame.rank (GH #742) + - Fix catching AttributeError instead of NameError for bottleneck + - Try to cast non-MultiIndex to better dtype when calling reset_index (GH #726 + #440) + - Fix #1.QNAN0' float bug on 2.6/win64 + - Allow subclasses of dicts in DataFrame constructor, with tests + - Fix problem whereby set_index destroys column multiindex (GH #764) + - Hack around bug in generating DateRange from naive DateOffset (GH #770) + - Fix bug in DateRange.intersection causing incorrect results with some + overlapping ranges (GH #771) + +Thanks +------ +- Craig Austin +- Chris Billington +- Marius Cobzarenco +- Mario Gamboa-Cavazos +- Hans-Martin Gaudecker +- Arthur Gerigk +- Yaroslav Halchenko +- Jeff Hammerbacher +- Matt Harrison +- Andreas Hilboll +- Luc Kesters +- Adam Klein +- Gregg Lind +- Solomon Negusse +- Wouter Overmeire +- Christian Prinoth +- Jeff Reback +- Sam Reckoner +- Craig Reeson +- Jan Schulz +- Skipper Seabold +- Ted Square +- Graham Taylor +- Aman Thakral +- Chris Uga +- Dieter Vandenbussche +- Texas P. +- Pinxing Ye +- ... and everyone I forgot + +pandas 0.6.1 +============ + +**Release date:** 12/13/2011 + +**API Changes** + + - Rename `names` argument in DataFrame.from_records to `columns`. Add + deprecation warning + - Boolean get/set operations on Series with boolean Series will reindex + instead of requiring that the indexes be exactly equal (GH #429) + +**New features / modules** + + - Can pass Series to DataFrame.append with ignore_index=True for appending a + single row (GH #430) + - Add Spearman and Kendall correlation options to Series.corr and + DataFrame.corr (GH #428) + - Add new `get_value` and `set_value` methods to Series, DataFrame, and Panel + to very low-overhead access to scalar elements. df.get_value(row, column) + is about 3x faster than df[column][row] by handling fewer cases (GH #437, + #438). Add similar methods to sparse data structures for compatibility + - Add Qt table widget to sandbox (PR #435) + - DataFrame.align can accept Series arguments, add axis keyword (GH #461) + - Implement new SparseList and SparseArray data structures. 
SparseSeries now + derives from SparseArray (GH #463) + - max_columns / max_rows options in set_printoptions (PR #453) + - Implement Series.rank and DataFrame.rank, fast versions of + scipy.stats.rankdata (GH #428) + - Implement DataFrame.from_items alternate constructor (GH #444) + - DataFrame.convert_objects method for inferring better dtypes for object + columns (GH #302) + - Add rolling_corr_pairwise function for computing Panel of correlation + matrices (GH #189) + - Add `margins` option to `pivot_table` for computing subgroup aggregates (GH + #114) + - Add `Series.from_csv` function (PR #482) + +**Improvements to existing features** + + - Improve memory usage of `DataFrame.describe` (do not copy data + unnecessarily) (PR #425) + - Use same formatting function for outputting floating point Series to console + as in DataFrame (PR #420) + - DataFrame.delevel will try to infer better dtype for new columns (GH #440) + - Exclude non-numeric types in DataFrame.{corr, cov} + - Override Index.astype to enable dtype casting (GH #412) + - Use same float formatting function for Series.__repr__ (PR #420) + - Use available console width to output DataFrame columns (PR #453) + - Accept ndarrays when setting items in Panel (GH #452) + - Infer console width when printing __repr__ of DataFrame to console (PR + #453) + - Optimize scalar value lookups in the general case by 25% or more in Series + and DataFrame + - Can pass DataFrame/DataFrame and DataFrame/Series to + rolling_corr/rolling_cov (GH #462) + - Fix performance regression in cross-sectional count in DataFrame, affecting + DataFrame.dropna speed + - Column deletion in DataFrame copies no data (computes views on blocks) (GH + #158) + - MultiIndex.get_level_values can take the level name + - More helpful error message when DataFrame.plot fails on one of the columns + (GH #478) + - Improve performance of DataFrame.{index, columns} attribute lookup + +**Bug fixes** + + - Fix O(K^2) memory leak caused by inserting many columns without + consolidating, had been present since 0.4.0 (GH #467) + - `DataFrame.count` should return Series with zero instead of NA with length-0 + axis (GH #423) + - Fix Yahoo! Finance API usage in pandas.io.data (GH #419, PR #427) + - Fix upstream bug causing failure in Series.align with empty Series (GH #434) + - Function passed to DataFrame.apply can return a list, as long as it's the + right length. 
Regression from 0.4 (GH #432) + - Don't "accidentally" upcast scalar values when indexing using .ix (GH #431) + - Fix groupby exception raised with as_index=False and single column selected + (GH #421) + - Implement DateOffset.__ne__ causing downstream bug (GH #456) + - Fix __doc__-related issue when converting py -> pyo with py2exe + - Bug fix in left join Cython code with duplicate monotonic labels + - Fix bug when unstacking multiple levels described in #451 + - Exclude NA values in dtype=object arrays, regression from 0.5.0 (GH #469) + - Use Cython map_infer function in DataFrame.applymap to properly infer + output type, handle tuple return values and other things that were breaking + (GH #465) + - Handle floating point index values in HDFStore (GH #454) + - Fixed stale column reference bug (cached Series object) caused by type + change / item deletion in DataFrame (GH #473) + - Index.get_loc should always raise Exception when there are duplicates + - Handle differently-indexed Series input to DataFrame constructor (GH #475) + - Omit nuisance columns in multi-groupby with Python function + - Buglet in handling of single grouping in general apply + - Handle type inference properly when passing list of lists or tuples to + DataFrame constructor (GH #484) + - Preserve Index / MultiIndex names in GroupBy.apply concatenation step (GH + #481) + +Thanks +------ +- Ralph Bean +- Luca Beltrame +- Marius Cobzarenco +- Andreas Hilboll +- Jev Kuznetsov +- Adam Lichtenstein +- Wouter Overmeire +- Fernando Perez +- Nathan Pinger +- Christian Prinoth +- Alex Reyfman +- Joon Ro +- Chang She +- Ted Square +- Chris Uga +- Dieter Vandenbussche + +pandas 0.6.0 +============ + +**Release date:** 11/25/2011 + +**API Changes** + + - Arithmetic methods like `sum` will attempt to sum dtype=object values by + default instead of excluding them (GH #382) + +**New features / modules** + + - Add `melt` function to `pandas.core.reshape` + - Add `level` parameter to group by level in Series and DataFrame + descriptive statistics (PR #313) + - Add `head` and `tail` methods to Series, analogous to to DataFrame (PR + #296) + - Add `Series.isin` function which checks if each value is contained in a + passed sequence (GH #289) + - Add `float_format` option to `Series.to_string` + - Add `skip_footer` (GH #291) and `converters` (GH #343) options to + `read_csv` and `read_table` + - Add proper, tested weighted least squares to standard and panel OLS (GH + #303) + - Add `drop_duplicates` and `duplicated` functions for removing duplicate + DataFrame rows and checking for duplicate rows, respectively (GH #319) + - Implement logical (boolean) operators &, |, ^ on DataFrame (GH #347) + - Add `Series.mad`, mean absolute deviation, matching DataFrame + - Add `QuarterEnd` DateOffset (PR #321) + - Add matrix multiplication function `dot` to DataFrame (GH #65) + - Add `orient` option to `Panel.from_dict` to ease creation of mixed-type + Panels (GH #359, #301) + - Add `DataFrame.from_dict` with similar `orient` option + - Can now pass list of tuples or list of lists to `DataFrame.from_records` + for fast conversion to DataFrame (GH #357) + - Can pass multiple levels to groupby, e.g. 
`df.groupby(level=[0, 1])` (GH + #103) + - Can sort by multiple columns in `DataFrame.sort_index` (GH #92, PR #362) + - Add fast `get_value` and `put_value` methods to DataFrame and + micro-performance tweaks (GH #360) + - Add `cov` instance methods to Series and DataFrame (GH #194, PR #362) + - Add bar plot option to `DataFrame.plot` (PR #348) + - Add `idxmin` and `idxmax` functions to Series and DataFrame for computing + index labels achieving maximum and minimum values (PR #286) + - Add `read_clipboard` function for parsing DataFrame from OS clipboard, + should work across platforms (GH #300) + - Add `nunique` function to Series for counting unique elements (GH #297) + - DataFrame constructor will use Series name if no columns passed (GH #373) + - Support regular expressions and longer delimiters in read_table/read_csv, + but does not handle quoted strings yet (GH #364) + - Add `DataFrame.to_html` for formatting DataFrame to HTML (PR #387) + - MaskedArray can be passed to DataFrame constructor and masked values will be + converted to NaN (PR #396) + - Add `DataFrame.boxplot` function (GH #368, others) + - Can pass extra args, kwds to DataFrame.apply (GH #376) + +**Improvements to existing features** + + - Raise more helpful exception if date parsing fails in DateRange (GH #298) + - Vastly improved performance of GroupBy on axes with a MultiIndex (GH #299) + - Print level names in hierarchical index in Series repr (GH #305) + - Return DataFrame when performing GroupBy on selected column and + as_index=False (GH #308) + - Can pass vector to `on` argument in `DataFrame.join` (GH #312) + - Don't show Series name if it's None in the repr, also omit length for short + Series (GH #317) + - Show legend by default in `DataFrame.plot`, add `legend` boolean flag (GH + #324) + - Significantly improved performance of `Series.order`, which also makes + np.unique called on a Series faster (GH #327) + - Faster cythonized count by level in Series and DataFrame (GH #341) + - Raise exception if dateutil 2.0 installed on Python 2.x runtime (GH #346) + - Significant GroupBy performance enhancement with multiple keys with many + "empty" combinations + - New Cython vectorized function `map_infer` speeds up `Series.apply` and + `Series.map` significantly when passed elementwise Python function, + motivated by PR #355 + - Cythonized `cache_readonly`, resulting in substantial micro-performance + enhancements throughout the codebase (GH #361) + - Special Cython matrix iterator for applying arbitrary reduction operations + with 3-5x better performance than `np.apply_along_axis` (GH #309) + - Add `raw` option to `DataFrame.apply` for getting better performance when + the passed function only requires an ndarray (GH #309) + - Improve performance of `MultiIndex.from_tuples` + - Can pass multiple levels to `stack` and `unstack` (GH #370) + - Can pass multiple values columns to `pivot_table` (GH #381) + - Can call `DataFrame.delevel` with standard Index with name set (GH #393) + - Use Series name in GroupBy for result index (GH #363) + - Refactor Series/DataFrame stat methods to use common set of NaN-friendly + function + - Handle NumPy scalar integers at C level in Cython conversion routines + +**Bug fixes** + + - Fix bug in `DataFrame.to_csv` when writing a DataFrame with an index + name (GH #290) + - DataFrame should clear its Series caches on consolidation, was causing + "stale" Series to be returned in some corner cases (GH #304) + - DataFrame constructor failed if a column had a list of tuples (GH #293) + - 
Ensure that `Series.apply` always returns a Series and implement + `Series.round` (GH #314) + - Support boolean columns in Cythonized groupby functions (GH #315) + - `DataFrame.describe` should not fail if there are no numeric columns, + instead return categorical describe (GH #323) + - Fixed bug which could cause columns to be printed in wrong order in + `DataFrame.to_string` if specific list of columns passed (GH #325) + - Fix legend plotting failure if DataFrame columns are integers (GH #326) + - Shift start date back by one month for Yahoo! Finance API in pandas.io.data + (GH #329) + - Fix `DataFrame.join` failure on unconsolidated inputs (GH #331) + - DataFrame.min/max will no longer fail on mixed-type DataFrame (GH #337) + - Fix `read_csv` / `read_table` failure when passing list to index_col that is + not in ascending order (GH #349) + - Fix failure passing Int64Index to Index.union when both are monotonic + - Fix error when passing SparseSeries to (dense) DataFrame constructor + - Added missing bang at top of setup.py (GH #352) + - Change `is_monotonic` on MultiIndex so it properly compares the tuples + - Fix MultiIndex outer join logic (GH #351) + - Set index name attribute with single-key groupby (GH #358) + - Bug fix in reflexive binary addition in Series and DataFrame for + non-commutative operations (like string concatenation) (GH #353) + - setupegg.py will invoke Cython (GH #192) + - Fix block consolidation bug after inserting column into MultiIndex (GH #366) + - Fix bug in join operations between Index and Int64Index (GH #367) + - Handle min_periods=0 case in moving window functions (GH #365) + - Fixed corner cases in DataFrame.apply/pivot with empty DataFrame (GH #378) + - Fixed repr exception when Series name is a tuple + - Always return DateRange from `asfreq` (GH #390) + - Pass level names to `swaplavel` (GH #379) + - Don't lose index names in `MultiIndex.droplevel` (GH #394) + - Infer more proper return type in `DataFrame.apply` when no columns or rows + depending on whether the passed function is a reduction (GH #389) + - Always return NA/NaN from Series.min/max and DataFrame.min/max when all of a + row/column/values are NA (GH #384) + - Enable partial setting with .ix / advanced indexing (GH #397) + - Handle mixed-type DataFrames correctly in unstack, do not lose type + information (GH #403) + - Fix integer name formatting bug in Index.format and in Series.__repr__ + - Handle label types other than string passed to groupby (GH #405) + - Fix bug in .ix-based indexing with partial retrieval when a label is not + contained in a level + - Index name was not being pickled (GH #408) + - Level name should be passed to result index in GroupBy.apply (GH #416) + +Thanks +------ + +- Craig Austin +- Marius Cobzarenco +- Joel Cross +- Jeff Hammerbacher +- Adam Klein +- Thomas Kluyver +- Jev Kuznetsov +- Kieran O'Mahony +- Wouter Overmeire +- Nathan Pinger +- Christian Prinoth +- Skipper Seabold +- Chang She +- Ted Square +- Aman Thakral +- Chris Uga +- Dieter Vandenbussche +- carljv +- rsamson + +pandas 0.5.0 +============ + +**Release date:** 10/24/2011 + +This release of pandas includes a number of API changes (see below) and cleanup +of deprecated APIs from pre-0.4.0 releases. There are also bug fixes, new +features, numerous significant performance enhancements, and includes a new +IPython completer hook to enable tab completion of DataFrame columns accesses +as attributes (a new feature). 
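+
+For illustration, a minimal sketch of the attribute-style column access that
+the new completer hook builds on; the column names and values below are
+invented for this example::
+
+    import numpy as np
+    import pandas as pd
+
+    df = pd.DataFrame({'price': np.random.randn(5),
+                       'volume': np.random.randn(5)})
+
+    # df.price is the same column as df['price']; in IPython, typing
+    # "df.<TAB>" now also offers the column names as completions.
+    assert (df.price == df['price']).all()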
+ +In addition to the changes listed here from 0.4.3 to 0.5.0, the minor releases +0.4.1, 0.4.2, and 0.4.3 brought some significant new functionality and +performance improvements that are worth taking a look at. + +Thanks to all for bug reports, contributed patches and generally providing +feedback on the library. + +**API Changes** + + - `read_table`, `read_csv`, and `ExcelFile.parse` default arguments for + `index_col` is now None. To use one or more of the columns as the resulting + DataFrame's index, these must be explicitly specified now + - Parsing functions like `read_csv` no longer parse dates by default (GH + #225) + - Removed `weights` option in panel regression which was not doing anything + principled (GH #155) + - Changed `buffer` argument name in `Series.to_string` to `buf` + - `Series.to_string` and `DataFrame.to_string` now return strings by default + instead of printing to sys.stdout + - Deprecated `nanRep` argument in various `to_string` and `to_csv` functions + in favor of `na_rep`. Will be removed in 0.6 (GH #275) + - Renamed `delimiter` to `sep` in `DataFrame.from_csv` for consistency + - Changed order of `Series.clip` arguments to match those of `numpy.clip` and + added (unimplemented) `out` argument so `numpy.clip` can be called on a + Series (GH #272) + - Series functions renamed (and thus deprecated) in 0.4 series have been + removed: + + * `asOf`, use `asof` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * `toCSV`, use `to_csv` + * `merge`, use `map` + * `applymap`, use `apply` + * `combineFirst`, use `combine_first` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + + - DataFrame functions renamed / deprecated in 0.4 series have been removed: + + * `asMatrix` method, use `as_matrix` or `values` attribute + * `combineFirst`, use `combine_first` + * `getXS`, use `xs` + * `merge`, use `join` + * `fromRecords`, use `from_records` + * `fromcsv`, use `from_csv` + * `toRecords`, use `to_records` + * `toDict`, use `to_dict` + * `toString`, use `to_string` + * `toCSV`, use `to_csv` + * `_firstTimeWithValue` use `first_valid_index` + * `_lastTimeWithValue` use `last_valid_index` + * `toDataMatrix` is no longer needed + * `rows()` method, use `index` attribute + * `cols()` method, use `columns` attribute + * `dropEmptyRows()`, use `dropna(how='all')` + * `dropIncompleteRows()`, use `dropna()` + * `tapply(f)`, use `apply(f, axis=1)` + * `tgroupby(keyfunc, aggfunc)`, use `groupby` with `axis=1` + + - Other outstanding deprecations have been removed: + + * `indexField` argument in `DataFrame.from_records` + * `missingAtEnd` argument in `Series.order`. Use `na_last` instead + * `Series.fromValue` classmethod, use regular `Series` constructor instead + * Functions `parseCSV`, `parseText`, and `parseExcel` methods in + `pandas.io.parsers` have been removed + * `Index.asOfDate` function + * `Panel.getMinorXS` (use `minor_xs`) and `Panel.getMajorXS` (use + `major_xs`) + * `Panel.toWide`, use `Panel.to_wide` instead + +**New features / modules** + + - Added `DataFrame.align` method with standard join options + - Added `parse_dates` option to `read_csv` and `read_table` methods to + optionally try to parse dates in the index columns + - Add `nrows`, `chunksize`, and `iterator` arguments to `read_csv` and + `read_table`. 
The last two return a new `TextParser` class capable of + lazily iterating through chunks of a flat file (GH #242) + - Added ability to join on multiple columns in `DataFrame.join` (GH #214) + - Added private `_get_duplicates` function to `Index` for identifying + duplicate values more easily + - Added column attribute access to DataFrame, e.g. df.A equivalent to df['A'] + if 'A' is a column in the DataFrame (PR #213) + - Added IPython tab completion hook for DataFrame columns. (PR #233, GH #230) + - Implement `Series.describe` for Series containing objects (PR #241) + - Add inner join option to `DataFrame.join` when joining on key(s) (GH #248) + - Can select set of DataFrame columns by passing a list to `__getitem__` (GH + #253) + - Can use & and | to intersection / union Index objects, respectively (GH + #261) + - Added `pivot_table` convenience function to pandas namespace (GH #234) + - Implemented `Panel.rename_axis` function (GH #243) + - DataFrame will show index level names in console output + - Implemented `Panel.take` + - Add `set_eng_float_format` function for setting alternate DataFrame + floating point string formatting + - Add convenience `set_index` function for creating a DataFrame index from + its existing columns + +**Improvements to existing features** + + - Major performance improvements in file parsing functions `read_csv` and + `read_table` + - Added Cython function for converting tuples to ndarray very fast. Speeds up + many MultiIndex-related operations + - File parsing functions like `read_csv` and `read_table` will explicitly + check if a parsed index has duplicates and raise a more helpful exception + rather than deferring the check until later + - Refactored merging / joining code into a tidy class and disabled unnecessary + computations in the float/object case, thus getting about 10% better + performance (GH #211) + - Improved speed of `DataFrame.xs` on mixed-type DataFrame objects by about + 5x, regression from 0.3.0 (GH #215) + - With new `DataFrame.align` method, speeding up binary operations between + differently-indexed DataFrame objects by 10-25%. + - Significantly sped up conversion of nested dict into DataFrame (GH #212) + - Can pass hierarchical index level name to `groupby` instead of the level + number if desired (GH #223) + - Add support for different delimiters in `DataFrame.to_csv` (PR #244) + - Add more helpful error message when importing pandas post-installation from + the source directory (GH #250) + - Significantly speed up DataFrame `__repr__` and `count` on large mixed-type + DataFrame objects + - Better handling of pyx file dependencies in Cython module build (GH #271) + +**Bug fixes** + + - `read_csv` / `read_table` fixes + - Be less aggressive about converting float->int in cases of floating point + representations of integers like 1.0, 2.0, etc. + - "True"/"False" will not get correctly converted to boolean + - Index name attribute will get set when specifying an index column + - Passing column names should force `header=None` (GH #257) + - Don't modify passed column names when `index_col` is not + None (GH #258) + - Can sniff CSV separator in zip file (since seek is not supported, was + failing before) + - Worked around matplotlib "bug" in which series[:, np.newaxis] fails. Should + be reported upstream to matplotlib (GH #224) + - DataFrame.iteritems was not returning Series with the name attribute + set. 
Also neither was DataFrame._series + - Can store datetime.date objects in HDFStore (GH #231) + - Index and Series names are now stored in HDFStore + - Fixed problem in which data would get upcasted to object dtype in + GroupBy.apply operations (GH #237) + - Fixed outer join bug with empty DataFrame (GH #238) + - Can create empty Panel (GH #239) + - Fix join on single key when passing list with 1 entry (GH #246) + - Don't raise Exception on plotting DataFrame with an all-NA column (GH #251, + PR #254) + - Bug min/max errors when called on integer DataFrames (PR #241) + - `DataFrame.iteritems` and `DataFrame._series` not assigning name attribute + - Panel.__repr__ raised exception on length-0 major/minor axes + - `DataFrame.join` on key with empty DataFrame produced incorrect columns + - Implemented `MultiIndex.diff` (GH #260) + - `Int64Index.take` and `MultiIndex.take` lost name field, fix downstream + issue GH #262 + - Can pass list of tuples to `Series` (GH #270) + - Can pass level name to `DataFrame.stack` + - Support set operations between MultiIndex and Index + - Fix many corner cases in MultiIndex set operations + - Fix MultiIndex-handling bug with GroupBy.apply when returned groups are not + indexed the same + - Fix corner case bugs in DataFrame.apply + - Setting DataFrame index did not cause Series cache to get cleared + - Various int32 -> int64 platform-specific issues + - Don't be too aggressive converting to integer when parsing file with + MultiIndex (GH #285) + - Fix bug when slicing Series with negative indices before beginning + +Thanks +------ + +- Thomas Kluyver +- Daniel Fortunov +- Aman Thakral +- Luca Beltrame +- Wouter Overmeire + +pandas 0.4.3 +============ + +Release notes +------------- + +**Release date:** 10/9/2011 + +This is largely a bugfix release from 0.4.2 but also includes a handful of new +and enhanced features. Also, pandas can now be installed and used on Python 3 +(thanks Thomas Kluyver!). + +**New features / modules** + + - Python 3 support using 2to3 (PR #200, Thomas Kluyver) + - Add `name` attribute to `Series` and added relevant logic and tests. Name + now prints as part of `Series.__repr__` + - Add `name` attribute to standard Index so that stacking / unstacking does + not discard names and so that indexed DataFrame objects can be reliably + round-tripped to flat files, pickle, HDF5, etc. + - Add `isnull` and `notnull` as instance methods on Series (PR #209, GH #203) + +**Improvements to existing features** + + - Skip xlrd-related unit tests if not installed + - `Index.append` and `MultiIndex.append` can accept a list of Index objects to + concatenate together + - Altered binary operations on differently-indexed SparseSeries objects to use + the integer-based (dense) alignment logic which is faster with a larger + number of blocks (GH #205) + - Refactored `Series.__repr__` to be a bit more clean and consistent + +**API Changes** + + - `Series.describe` and `DataFrame.describe` now bring the 25% and 75% + quartiles instead of the 10% and 90% deciles. The other outputs have not + changed + - `Series.toString` will print deprecation warning, has been de-camelCased to + `to_string` + +**Bug fixes** + + - Fix broken interaction between `Index` and `Int64Index` when calling + intersection. 
Implement `Int64Index.intersection` + - `MultiIndex.sortlevel` discarded the level names (GH #202) + - Fix bugs in groupby, join, and append due to improper concatenation of + `MultiIndex` objects (GH #201) + - Fix regression from 0.4.1, `isnull` and `notnull` ceased to work on other + kinds of Python scalar objects like `datetime.datetime` + - Raise more helpful exception when attempting to write empty DataFrame or + LongPanel to `HDFStore` (GH #204) + - Use stdlib csv module to properly escape strings with commas in + `DataFrame.to_csv` (PR #206, Thomas Kluyver) + - Fix Python ndarray access in Cython code for sparse blocked index integrity + check + - Fix bug writing Series to CSV in Python 3 (PR #209) + - Miscellaneous Python 3 bugfixes + +Thanks +------ + + - Thomas Kluyver + - rsamson + +pandas 0.4.2 +============ + +Release notes +------------- + +**Release date:** 10/3/2011 + +This is a performance optimization release with several bug fixes. The new +Int64Index and new merging / joining Cython code and related Python +infrastructure are the main new additions + +**New features / modules** + + - Added fast `Int64Index` type with specialized join, union, + intersection. Will result in significant performance enhancements for + int64-based time series (e.g. using NumPy's datetime64 one day) and also + faster operations on DataFrame objects storing record array-like data. + - Refactored `Index` classes to have a `join` method and associated data + alignment routines throughout the codebase to be able to leverage optimized + joining / merging routines. + - Added `Series.align` method for aligning two series with choice of join + method + - Wrote faster Cython data alignment / merging routines resulting in + substantial speed increases + - Added `is_monotonic` property to `Index` classes with associated Cython + code to evaluate the monotonicity of the `Index` values + - Add method `get_level_values` to `MultiIndex` + - Implemented shallow copy of `BlockManager` object in `DataFrame` internals + +**Improvements to existing features** + + - Improved performance of `isnull` and `notnull`, a regression from v0.3.0 + (GH #187) + - Wrote templating / code generation script to auto-generate Cython code for + various functions which need to be available for the 4 major data types + used in pandas (float64, bool, object, int64) + - Refactored code related to `DataFrame.join` so that intermediate aligned + copies of the data in each `DataFrame` argument do not need to be + created. Substantial performance increases result (GH #176) + - Substantially improved performance of generic `Index.intersection` and + `Index.union` + - Improved performance of `DateRange.union` with overlapping ranges and + non-cacheable offsets (like Minute). Implemented analogous fast + `DateRange.intersection` for overlapping ranges. 
+ - Implemented `BlockManager.take` resulting in significantly faster `take` + performance on mixed-type `DataFrame` objects (GH #104) + - Improved performance of `Series.sort_index` + - Significant groupby performance enhancement: removed unnecessary integrity + checks in DataFrame internals that were slowing down slicing operations to + retrieve groups + - Added informative Exception when passing dict to DataFrame groupby + aggregation with axis != 0 + +**API Changes** + +None + +**Bug fixes** + + - Fixed minor unhandled exception in Cython code implementing fast groupby + aggregation operations + - Fixed bug in unstacking code manifesting with more than 3 hierarchical + levels + - Throw exception when step specified in label-based slice (GH #185) + - Fix isnull to correctly work with np.float32. Fix upstream bug described in + GH #182 + - Finish implementation of as_index=False in groupby for DataFrame + aggregation (GH #181) + - Raise SkipTest for pre-epoch HDFStore failure. Real fix will be sorted out + via datetime64 dtype + +Thanks +------ + +- Uri Laserson +- Scott Sinclair + +pandas 0.4.1 +============ + +Release notes +------------- + +**Release date:** 9/25/2011 + +This is primarily a bug fix release but includes some new features and +improvements + +**New features / modules** + + - Added new `DataFrame` methods `get_dtype_counts` and property `dtypes` + - Setting of values using ``.ix`` indexing attribute in mixed-type DataFrame + objects has been implemented (fixes GH #135) + - `read_csv` can read multiple columns into a `MultiIndex`. DataFrame's + `to_csv` method will properly write out a `MultiIndex` which can be read + back (PR #151, thanks to Skipper Seabold) + - Wrote fast time series merging / joining methods in Cython. Will be + integrated later into DataFrame.join and related functions + - Added `ignore_index` option to `DataFrame.append` for combining unindexed + records stored in a DataFrame + +**Improvements to existing features** + + - Some speed enhancements with internal Index type-checking function + - `DataFrame.rename` has a new `copy` parameter which can rename a DataFrame + in place + - Enable unstacking by level name (PR #142) + - Enable sortlevel to work by level name (PR #141) + - `read_csv` can automatically "sniff" other kinds of delimiters using + `csv.Sniffer` (PR #146) + - Improved speed of unit test suite by about 40% + - Exception will not be raised calling `HDFStore.remove` on non-existent node + with where clause + - Optimized `_ensure_index` function resulting in performance savings in + type-checking Index objects + +**API Changes** + +None + +**Bug fixes** + + - Fixed DataFrame constructor bug causing downstream problems (e.g. .copy() + failing) when passing a Series as the values along with a column name and + index + - Fixed single-key groupby on DataFrame with as_index=False (GH #160) + - `Series.shift` was failing on integer Series (GH #154) + - `unstack` methods were producing incorrect output in the case of duplicate + hierarchical labels. 
An exception will now be raised (GH #147)
+  - Calling `count` with level argument caused reduceat failure or segfault in
+    earlier NumPy (GH #169)
+  - Fixed `DataFrame.corrwith` to automatically exclude non-numeric data (GH
+    #144)
+  - Unicode handling bug fixes in `DataFrame.to_string` (GH #138)
+  - Excluding OLS degenerate unit test case that was causing platform specific
+    failure (GH #149)
+  - Skip blosc-dependent unit tests for PyTables < 2.2 (PR #137)
+  - Calling `copy` on `DateRange` did not copy over attributes to the new
+    object (GH #168)
+  - Fix bug in `HDFStore` in which Panel data could be appended to a Table with
+    different item order, thus resulting in an incorrect result read back
+
+Thanks
+------
+- Yaroslav Halchenko
+- Jeff Reback
+- Skipper Seabold
+- Dan Lovell
+- Nick Pentreath
+
+pandas 0.4.0
+============
+
+Release notes
+-------------
+
+**Release date:** 9/12/2011
+
+**New features / modules**
+
+  - `pandas.core.sparse` module: "Sparse" (mostly-NA, or some other fill value)
+    versions of `Series`, `DataFrame`, and `Panel`. For low-density data, this
+    will result in significant performance boosts, and smaller memory
+    footprint. Added `to_sparse` methods to `Series`, `DataFrame`, and
+    `Panel`. See online documentation for more on these
+  - Fancy indexing operator on Series / DataFrame, e.g. via .ix operator. Both
+    getting and setting of values is supported; however, setting values will
+    only currently work on homogeneously-typed DataFrame objects. Things like:
+
+    * series.ix[[d1, d2, d3]]
+    * frame.ix[5:10, ['C', 'B', 'A']], frame.ix[5:10, 'A':'C']
+    * frame.ix[date1:date2]
+
+  - Significantly enhanced `groupby` functionality
+
+    * Can groupby multiple keys, e.g. df.groupby(['key1', 'key2']). Iteration
+      with multiple groupings produces a flattened tuple
+    * "Nuisance" columns (non-aggregatable) will automatically be excluded from
+      DataFrame aggregation operations
+    * Added automatic "dispatching" to Series / DataFrame methods to more
+      easily invoke methods on groups. e.g. s.groupby(crit).std() will work
+      even though `std` is not implemented on the `GroupBy` class
+
+  - Hierarchical / multi-level indexing
+
+    * New `MultiIndex` class. Integrated `MultiIndex` into `Series` and
+      `DataFrame` fancy indexing, slicing, __getitem__ and __setitem__,
+      reindexing, etc. Added `level` keyword argument to `groupby` to enable
+      grouping by a level of a `MultiIndex`
+
+  - New data reshaping functions: `stack` and `unstack` on DataFrame and Series
+
+    * Integrate with MultiIndex to enable sophisticated reshaping of data
+
+  - `Index` objects (labels for axes) are now capable of holding tuples
+  - `Series.describe`, `DataFrame.describe`: produce an R-like table of summary
+    statistics about each data column
+  - `DataFrame.quantile`, `Series.quantile` for computing sample quantiles of
+    data across requested axis
+  - Added general `DataFrame.dropna` method to replace `dropIncompleteRows` and
+    `dropEmptyRows`, deprecated those.
+  - `Series` arithmetic methods with optional fill_value for missing data,
+    e.g. a.add(b, fill_value=0). If a location is missing for both it will
+    still be missing in the result though.
+  - fill_value option has been added to `DataFrame`.{add, mul, sub, div}
+    methods similar to `Series`
+  - Boolean indexing with `DataFrame` objects: data[data > 0.1] = 0.1 or
+    data[data > other] = 1.
+ - `pytz` / tzinfo support in `DateRange` + + * `tz_localize`, `tz_normalize`, and `tz_validate` methods added + + - Added `ExcelFile` class to `pandas.io.parsers` for parsing multiple sheets out + of a single Excel 2003 document + - `GroupBy` aggregations can now optionally *broadcast*, e.g. produce an object + of the same size with the aggregated value propagated + - Added `select` function in all data structures: reindex axis based on + arbitrary criterion (function returning boolean value), + e.g. frame.select(lambda x: 'foo' in x, axis=1) + - `DataFrame.consolidate` method, API function relating to redesigned internals + - `DataFrame.insert` method for inserting column at a specified location rather + than the default __setitem__ behavior (which puts it at the end) + - `HDFStore` class in `pandas.io.pytables` has been largely rewritten using + patches from Jeff Reback from others. It now supports mixed-type `DataFrame` + and `Series` data and can store `Panel` objects. It also has the option to + query `DataFrame` and `Panel` data. Loading data from legacy `HDFStore` + files is supported explicitly in the code + - Added `set_printoptions` method to modify appearance of DataFrame tabular + output + - `rolling_quantile` functions; a moving version of `Series.quantile` / + `DataFrame.quantile` + - Generic `rolling_apply` moving window function + - New `drop` method added to `Series`, `DataFrame`, etc. which can drop a set of + labels from an axis, producing a new object + - `reindex` methods now sport a `copy` option so that data is not forced to be + copied then the resulting object is indexed the same + - Added `sort_index` methods to Series and Panel. Renamed `DataFrame.sort` + to `sort_index`. Leaving `DataFrame.sort` for now. + - Added ``skipna`` option to statistical instance methods on all the data + structures + - `pandas.io.data` module providing a consistent interface for reading time + series data from several different sources + +**Improvements to existing features** + + * The 2-dimensional `DataFrame` and `DataMatrix` classes have been extensively + redesigned internally into a single class `DataFrame`, preserving where + possible their optimal performance characteristics. This should reduce + confusion from users about which class to use. + + * Note that under the hood there is a new essentially "lazy evaluation" + scheme within respect to adding columns to DataFrame. During some + operations, like-typed blocks will be "consolidated" but not before. + + * `DataFrame` accessing columns repeatedly is now significantly faster than + `DataMatrix` used to be in 0.3.0 due to an internal Series caching mechanism + (which are all views on the underlying data) + * Column ordering for mixed type data is now completely consistent in + `DataFrame`. In prior releases, there was inconsistent column ordering in + `DataMatrix` + * Improved console / string formatting of DataMatrix with negative numbers + * Improved tabular data parsing functions, `read_table` and `read_csv`: + + * Added `skiprows` and `na_values` arguments to `pandas.io.parsers` functions + for more flexible IO + * `parseCSV` / `read_csv` functions and others in `pandas.io.parsers` now can + take a list of custom NA values, and also a list of rows to skip + + * Can slice `DataFrame` and get a view of the data (when homogeneously typed), + e.g. 
frame.xs(idx, copy=False) or frame.ix[idx] + * Many speed optimizations throughout `Series` and `DataFrame` + * Eager evaluation of groups when calling ``groupby`` functions, so if there is + an exception with the grouping function it will raised immediately versus + sometime later on when the groups are needed + * `datetools.WeekOfMonth` offset can be parameterized with `n` different than 1 + or -1. + * Statistical methods on DataFrame like `mean`, `std`, `var`, `skew` will now + ignore non-numerical data. Before a not very useful error message was + generated. A flag `numeric_only` has been added to `DataFrame.sum` and + `DataFrame.count` to enable this behavior in those methods if so desired + (disabled by default) + * `DataFrame.pivot` generalized to enable pivoting multiple columns into a + `DataFrame` with hierarchical columns + * `DataFrame` constructor can accept structured / record arrays + * `Panel` constructor can accept a dict of DataFrame-like objects. Do not + need to use `from_dict` anymore (`from_dict` is there to stay, though). + +**API Changes** + + * The `DataMatrix` variable now refers to `DataFrame`, will be removed within + two releases + * `WidePanel` is now known as `Panel`. The `WidePanel` variable in the pandas + namespace now refers to the renamed `Panel` class + * `LongPanel` and `Panel` / `WidePanel` now no longer have a common + subclass. `LongPanel` is now a subclass of `DataFrame` having a number of + additional methods and a hierarchical index instead of the old + `LongPanelIndex` object, which has been removed. Legacy `LongPanel` pickles + may not load properly + * Cython is now required to build `pandas` from a development branch. This was + done to avoid continuing to check in cythonized C files into source + control. Builds from released source distributions will not require Cython + * Cython code has been moved up to a top level `pandas/src` directory. Cython + extension modules have been renamed and promoted from the `lib` subpackage to + the top level, i.e. + + * `pandas.lib.tseries` -> `pandas._tseries` + * `pandas.lib.sparse` -> `pandas._sparse` + + * `DataFrame` pickling format has changed. Backwards compatibility for legacy + pickles is provided, but it's recommended to consider PyTables-based + `HDFStore` for storing data with a longer expected shelf life + * A `copy` argument has been added to the `DataFrame` constructor to avoid + unnecessary copying of data. Data is no longer copied by default when passed + into the constructor + * Handling of boolean dtype in `DataFrame` has been improved to support storage + of boolean data with NA / NaN values. Before it was being converted to float64 + so this should not (in theory) cause API breakage + * To optimize performance, Index objects now only check that their labels are + unique when uniqueness matters (i.e. when someone goes to perform a + lookup). This is a potentially dangerous tradeoff, but will lead to much + better performance in many places (like groupby). + * Boolean indexing using Series must now have the same indices (labels) + * Backwards compatibility support for begin/end/nPeriods keyword arguments in + DateRange class has been removed + * More intuitive / shorter filling aliases `ffill` (for `pad`) and `bfill` (for + `backfill`) have been added to the functions that use them: `reindex`, + `asfreq`, `fillna`. + * `pandas.core.mixins` code moved to `pandas.core.generic` + * `buffer` keyword arguments (e.g. 
`DataFrame.toString`) renamed to `buf` to + avoid using Python built-in name + * `DataFrame.rows()` removed (use `DataFrame.index`) + * Added deprecation warning to `DataFrame.cols()`, to be removed in next release + * `DataFrame` deprecations and de-camelCasing: `merge`, `asMatrix`, + `toDataMatrix`, `_firstTimeWithValue`, `_lastTimeWithValue`, `toRecords`, + `fromRecords`, `tgroupby`, `toString` + * `pandas.io.parsers` method deprecations + + * `parseCSV` is now `read_csv` and keyword arguments have been de-camelCased + * `parseText` is now `read_table` + * `parseExcel` is replaced by the `ExcelFile` class and its `parse` method + + * `fillMethod` arguments (deprecated in prior release) removed, should be + replaced with `method` + * `Series.fill`, `DataFrame.fill`, and `Panel.fill` removed, use `fillna` + instead + * `groupby` functions now exclude NA / NaN values from the list of groups. This + matches R behavior with NAs in factors e.g. with the `tapply` function + * Removed `parseText`, `parseCSV` and `parseExcel` from pandas namespace + * `Series.combineFunc` renamed to `Series.combine` and made a bit more general + with a `fill_value` keyword argument defaulting to NaN + * Removed `pandas.core.pytools` module. Code has been moved to + `pandas.core.common` + * Tacked on `groupName` attribute for groups in GroupBy renamed to `name` + * Panel/LongPanel `dims` attribute renamed to `shape` to be more conformant + * Slicing a `Series` returns a view now + * More Series deprecations / renaming: `toCSV` to `to_csv`, `asOf` to `asof`, + `merge` to `map`, `applymap` to `apply`, `toDict` to `to_dict`, + `combineFirst` to `combine_first`. Will print `FutureWarning`. + * `DataFrame.to_csv` does not write an "index" column label by default + anymore since the output file can be read back without it. However, there + is a new ``index_label`` argument. So you can do ``index_label='index'`` to + emulate the old behavior + * `datetools.Week` argument renamed from `dayOfWeek` to `weekday` + * `timeRule` argument in `shift` has been deprecated in favor of using the + `offset` argument for everything. So you can still pass a time rule string + to `offset` + * Added optional `encoding` argument to `read_csv`, `read_table`, `to_csv`, + `from_csv` to handle unicode in python 2.x + +**Bug fixes** + + * Column ordering in `pandas.io.parsers.parseCSV` will match CSV in the presence + of mixed-type data + * Fixed handling of Excel 2003 dates in `pandas.io.parsers` + * `DateRange` caching was happening with high resolution `DateOffset` objects, + e.g. `DateOffset(seconds=1)`. 
This has been fixed + * Fixed __truediv__ issue in `DataFrame` + * Fixed `DataFrame.toCSV` bug preventing IO round trips in some cases + * Fixed bug in `Series.plot` causing matplotlib to barf in exceptional cases + * Disabled `Index` objects from being hashable, like ndarrays + * Added `__ne__` implementation to `Index` so that operations like ts[ts != idx] + will work + * Added `__ne__` implementation to `DataFrame` + * Bug / unintuitive result when calling `fillna` on unordered labels + * Bug calling `sum` on boolean DataFrame + * Bug fix when creating a DataFrame from a dict with scalar values + * Series.{sum, mean, std, ...} now return NA/NaN when the whole Series is NA + * NumPy 1.4 through 1.6 compatibility fixes + * Fixed bug in bias correction in `rolling_cov`, was affecting `rolling_corr` + too + * R-square value was incorrect in the presence of fixed and time effects in + the `PanelOLS` classes + * `HDFStore` can handle duplicates in table format, will take + +Thanks +------ + - Joon Ro + - Michael Pennington + - Chris Uga + - Chris Withers + - Jeff Reback + - Ted Square + - Craig Austin + - William Ferreira + - Daniel Fortunov + - Tony Roberts + - Martin Felder + - John Marino + - Tim McNamara + - Justin Berka + - Dieter Vandenbussche + - Shane Conway + - Skipper Seabold + - Chris Jordan-Squire + +pandas 0.3.0 +============ + +This major release of pandas represents approximately 1 year of continuous +development work and brings with it many new features, bug fixes, speed +enhancements, and general quality-of-life improvements. The most significant +change from the 0.2 release has been the completion of a rigorous unit test +suite covering all of the core functionality. + +Release notes +------------- + +**Release date:** February 20, 2011 + +**New features / modules** + +* DataFrame / DataMatrix classes + + * `corrwith` function to compute column- or row-wise correlations between two + objects + * Can boolean-index DataFrame objects, e.g. df[df > 2] = 2, px[px > last_px] = 0 + * Added comparison magic methods (__lt__, __gt__, etc.) + * Flexible explicit arithmetic methods (add, mul, sub, div, etc.) + * Added `reindex_like` method + +* WidePanel + + * Added `reindex_like` method + +* `pandas.io`: IO utilities + + * `pandas.io.sql` module + + * Convenience functions for accessing SQL-like databases + + * `pandas.io.pytables` module + + * Added (still experimental) HDFStore class for storing pandas data + structures using HDF5 / PyTables + +* `pandas.core.datetools` + + * Added WeekOfMonth date offset + +* `pandas.rpy` (experimental) module created, provide some interfacing / + conversion between rpy2 and pandas + +**Improvements** + +* Unit test coverage: 100% line coverage of core data structures + +* Speed enhancement to rolling_{median, max, min} + +* Column ordering between DataFrame and DataMatrix is now consistent: before + DataFrame would not respect column order + +* Improved {Series, DataFrame}.plot methods to be more flexible (can pass + matplotlib Axis arguments, plot DataFrame columns in multiple subplots, etc.) + +**API Changes** + +* Exponentially-weighted moment functions in `pandas.stats.moments` + have a more consistent API and accept a min_periods argument like + their regular moving counterparts. + +* **fillMethod** argument in Series, DataFrame changed to **method**, + `FutureWarning` added. 
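+
+  A minimal sketch of what the renamed argument looks like in user code
+  (illustrative values only; it also uses the `fillna` name introduced in the
+  next item)::
+
+    import numpy as np
+    from pandas import Series
+
+    s = Series([1., np.nan, 3.])
+    filled = s.fillna(method='pad')  # `method` replaces the old `fillMethod`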
+ +* **fill** method in Series, DataFrame/DataMatrix, WidePanel renamed to + **fillna**, `FutureWarning` added to **fill** + +* Renamed **DataFrame.getXS** to **xs**, `FutureWarning` added + +* Removed **cap** and **floor** functions from DataFrame, renamed to + **clip_upper** and **clip_lower** for consistency with NumPy + +**Bug fixes** + +* Fixed bug in IndexableSkiplist Cython code that was breaking + rolling_max function + +* Numerous numpy.int64-related indexing fixes + +* Several NumPy 1.4.0 NaN-handling fixes + +* Bug fixes to pandas.io.parsers.parseCSV + +* Fixed `DateRange` caching issue with unusual date offsets + +* Fixed bug in `DateRange.union` + +* Fixed corner case in `IndexableSkiplist` implementation diff --git a/TODO.rst b/TODO.rst new file mode 100644 index 00000000..b3f9d65e --- /dev/null +++ b/TODO.rst @@ -0,0 +1,60 @@ +DOCS 0.7.0 +---------- +- ??? no sort in groupby +- DONE concat with dict +- Gotchas re: integer indexing + +DONE +---- +- SparseSeries name integration + tests +- Refactor Series.repr + +TODO +---- +- _consolidate, does it always copy? +- Series.align with fill method. Will have to generate more Cython code +- TYPE inference in Index-- more than just datetime! + +TODO docs +--------- + +- DONE read_csv / read_table + - auto-sniff delimiter + - MultiIndex + - generally more documentation +- DONE pivot_table +- DONE Set mixed-type values with .ix +- DONE get_dtype_counts / dtypes +- DONE save / load functions +- DONE isnull/notnull as instance methods +- DONE DataFrame.to_string +- DONE IPython tab complete hook +- DONE ignore_index in DataFrame.append +- DONE describe for Series with dtype=object +- DONE as_index=False in groupby +- DONOTWANT is_monotonic +- DONE DataFrame.to_csv: different delimiters +- DONE combine_first +- DONE groupby with level name +- DONE MultiIndex get_level_values +- DONE & and | for intersection / union +- DONE Update to reflect Python 3 support in intro +- DONE Index / MultiIndex names +- DONE Unstack / stack by level name +- DONE name attribute on Series +- DONE Multi-key joining +- DONE Inner join on key +- DONE align functions +- DONE df[col_list] +- DONE Panel.rename_axis + +Performance blog +---------------- +- Series / Time series data alignment +- DataFrame alignment +- Groupby +- joining +- Take + +git log v0.6.1..master --pretty=format:%aN | sort | uniq -c | sort -rn +git log a8c2f88..master --pretty=format:%aN | sort | uniq -c | sort -rn diff --git a/bench/alignment.py b/bench/alignment.py new file mode 100644 index 00000000..bf5d5604 --- /dev/null +++ b/bench/alignment.py @@ -0,0 +1,21 @@ +# Setup +import numpy as np +import pandas +import la +N = 1000 +K = 50 +arr1 = np.random.randn(N, K) +arr2 = np.random.randn(N, K) +idx1 = range(N) +idx2 = range(K) + +# pandas +dma1 = pandas.DataFrame(arr1, idx1, idx2) +dma2 = pandas.DataFrame(arr2, idx1[::-1], idx2[::-1]) + +# larry +lar1 = la.larry(arr1, [idx1, idx2]) +lar2 = la.larry(arr2, [idx1[::-1], idx2[::-1]]) + +for i in range(100): + result = lar1 + lar2 diff --git a/bench/bench_dense_to_sparse.py b/bench/bench_dense_to_sparse.py new file mode 100644 index 00000000..349d3b31 --- /dev/null +++ b/bench/bench_dense_to_sparse.py @@ -0,0 +1,15 @@ +from pandas import * + +K = 100 +N = 100000 +rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute()) + +rng2 = np.asarray(rng).astype('M8[us]').astype('i8') + +series = {} +for i in range(1, K + 1): + data = np.random.randn(N)[:-i] + this_rng = rng2[:-i] + data[100:] = np.nan + series[i] = SparseSeries(data, 
index=this_rng) + diff --git a/bench/bench_get_put_value.py b/bench/bench_get_put_value.py new file mode 100644 index 00000000..5aa984d3 --- /dev/null +++ b/bench/bench_get_put_value.py @@ -0,0 +1,48 @@ +from pandas import * +from pandas.util.testing import rands + +N = 1000 +K = 50 + +def _random_index(howmany): + return Index([rands(10) for _ in xrange(howmany)]) + +df = DataFrame(np.random.randn(N, K), index=_random_index(N), + columns=_random_index(K)) + +def get1(): + for col in df.columns: + for row in df.index: + _ = df[col][row] + +def get2(): + for col in df.columns: + for row in df.index: + _ = df.get_value(row, col) + +def put1(): + for col in df.columns: + for row in df.index: + df[col][row] = 0 + +def put2(): + for col in df.columns: + for row in df.index: + df.set_value(row, col, 0) + +def resize1(): + buf = DataFrame() + for col in df.columns: + for row in df.index: + buf = buf.set_value(row, col, 5.) + return buf + +def resize2(): + from collections import defaultdict + + buf = defaultdict(dict) + for col in df.columns: + for row in df.index: + buf[col][row] = 5. + + return DataFrame(buf) diff --git a/bench/bench_groupby.py b/bench/bench_groupby.py new file mode 100644 index 00000000..78e2c51a --- /dev/null +++ b/bench/bench_groupby.py @@ -0,0 +1,61 @@ +from pandas import * +from pandas.util.testing import rands + +import string +import random + +k = 20000 +n = 10 + +foo = np.tile(np.array([rands(10) for _ in xrange(k)], dtype='O'), n) +foo2 = list(foo) +random.shuffle(foo) +random.shuffle(foo2) + +df = DataFrame({'A' : foo, + 'B' : foo2, + 'C' : np.random.randn(n * k)}) + +import pandas._sandbox as sbx + +def f(): + table = sbx.StringHashTable(len(df)) + ret = table.factorize(df['A']) + return ret +def g(): + table = sbx.PyObjectHashTable(len(df)) + ret = table.factorize(df['A']) + return ret + +ret = f() + +""" +import pandas._tseries as lib + +f = np.std + + +grouped = df.groupby(['A', 'B']) + +label_list = [ping.labels for ping in grouped.groupings] +shape = [len(ping.ids) for ping in grouped.groupings] + +from pandas.core.groupby import get_group_index + + +group_index = get_group_index(label_list, shape).astype('i4') + +ngroups = np.prod(shape) + +indexer = lib.groupsort_indexer(group_index, ngroups) + +values = df['C'].values.take(indexer) +group_index = group_index.take(indexer) + +f = lambda x: x.std(ddof=1) + +grouper = lib.Grouper(df['C'], np.ndarray.std, group_index, ngroups) +result = grouper.get_result() + +expected = grouped.std() +""" diff --git a/bench/bench_join_panel.py b/bench/bench_join_panel.py new file mode 100644 index 00000000..59a4711c --- /dev/null +++ b/bench/bench_join_panel.py @@ -0,0 +1,77 @@ +# reasonably effecient + +def create_panels_append(cls, panels): + """ return an append list of panels """ + panels = [ a for a in panels if a is not None ] + # corner cases + if len(panels) == 0: + return None + elif len(panels) == 1: + return panels[0] + elif len(panels) == 2 and panels[0] == panels[1]: + return panels[0] + #import pdb; pdb.set_trace() + # create a joint index for the axis + def joint_index_for_axis(panels, axis): + s = set() + for p in panels: + s.update(list(getattr(p,axis))) + return sorted(list(s)) + def reindex_on_axis(panels, axis, axis_reindex): + new_axis = joint_index_for_axis(panels, axis) + new_panels = [ p.reindex(**{ axis_reindex : new_axis, 'copy' : False}) for p in panels ] + return new_panels, new_axis + # create the joint major index, dont' reindex the sub-panels - we are appending + major = 
joint_index_for_axis(panels, 'major_axis') + # reindex on minor axis + panels, minor = reindex_on_axis(panels, 'minor_axis', 'minor') + # reindex on items + panels, items = reindex_on_axis(panels, 'items', 'items') + # concatenate values + try: + values = np.concatenate([ p.values for p in panels ],axis=1) + except (Exception), detail: + raise Exception("cannot append values that dont' match dimensions! -> [%s] %s" % (','.join([ "%s" % p for p in panels ]),str(detail))) + #pm('append - create_panel') + p = Panel(values, items = items, major_axis = major, minor_axis = minor ) + #pm('append - done') + return p + + + +# does the job but inefficient (better to handle like you read a table in pytables...e.g create a LongPanel then convert to Wide) + +def create_panels_join(cls, panels): + """ given an array of panels's, create a single panel """ + panels = [ a for a in panels if a is not None ] + # corner cases + if len(panels) == 0: + return None + elif len(panels) == 1: + return panels[0] + elif len(panels) == 2 and panels[0] == panels[1]: + return panels[0] + d = dict() + minor, major, items = set(), set(), set() + for panel in panels: + items.update(panel.items) + major.update(panel.major_axis) + minor.update(panel.minor_axis) + values = panel.values + for item, item_index in panel.items.indexMap.items(): + for minor_i, minor_index in panel.minor_axis.indexMap.items(): + for major_i, major_index in panel.major_axis.indexMap.items(): + try: + d[(minor_i,major_i,item)] = values[item_index,major_index,minor_index] + except: + pass + # stack the values + minor = sorted(list(minor)) + major = sorted(list(major)) + items = sorted(list(items)) + # create the 3d stack (items x columns x indicies) + data = np.dstack([ np.asarray([ np.asarray([ d.get((minor_i,major_i,item),np.nan) for item in items ]) for major_i in major ]).transpose() for minor_i in minor ]) + # construct the panel + return Panel(data, items, major, minor) +add_class_method(Panel, create_panels_join, 'join_many') + diff --git a/bench/bench_khash_dict.py b/bench/bench_khash_dict.py new file mode 100644 index 00000000..1d803bec --- /dev/null +++ b/bench/bench_khash_dict.py @@ -0,0 +1,78 @@ +""" +Some comparisons of khash.h to Python dict +""" + +import numpy as np +import os + +from vbench.api import Benchmark +from pandas.util.testing import rands +import pandas._tseries as lib +import pandas._sandbox as sbx +import time + +import psutil + +pid = os.getpid() +proc = psutil.Process(pid) + +def object_test_data(n): + pass + +def string_test_data(n): + return np.array([rands(10) for _ in xrange(n)], dtype='O') + +def int_test_data(n): + return np.arange(n, dtype='i8') + +N = 1000000 + +#---------------------------------------------------------------------- +# Benchmark 1: map_locations + +def map_locations_python_object(): + arr = string_test_data(N) + return _timeit(lambda: lib.map_indices_object(arr)) + +def map_locations_khash_object(): + arr = string_test_data(N) + def f(): + table = sbx.PyObjectHashTable(len(arr)) + table.map_locations(arr) + return _timeit(f) + +def _timeit(f, iterations=10): + start = time.time() + for _ in xrange(iterations): + foo = f() + elapsed = time.time() - start + return elapsed + +#---------------------------------------------------------------------- +# Benchmark 2: lookup_locations + +def lookup_python(values): + table = lib.map_indices_object(values) + return _timeit(lambda: lib.merge_indexer_object(values, table)) + +def lookup_khash(values): + table = sbx.PyObjectHashTable(len(values)) + 
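+    # map_locations fills the khash-backed table with each value's position;
+    # the benchmark then does a single bulk lookup of the same values below
+    # (the timing call itself is left commented out).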
table.map_locations(values) + locs = table.lookup_locations(values) + # elapsed = _timeit(lambda: table.lookup_locations2(values)) + return table + +def leak(values): + for _ in xrange(100): + print proc.get_memory_info() + table = lookup_khash(values) + # table.destroy() + +arr = string_test_data(N) + +#---------------------------------------------------------------------- +# Benchmark 3: unique + +#---------------------------------------------------------------------- +# Benchmark 4: factorize + diff --git a/bench/bench_merge.R b/bench/bench_merge.R new file mode 100644 index 00000000..3ed46184 --- /dev/null +++ b/bench/bench_merge.R @@ -0,0 +1,161 @@ +library(plyr) +library(data.table) +N <- 10000 +indices = rep(NA, N) +indices2 = rep(NA, N) +for (i in 1:N) { + indices[i] <- paste(sample(letters, 10), collapse="") + indices2[i] <- paste(sample(letters, 10), collapse="") +} +left <- data.frame(key=rep(indices[1:8000], 10), + key2=rep(indices2[1:8000], 10), + value=rnorm(80000)) +right <- data.frame(key=indices[2001:10000], + key2=indices2[2001:10000], + value2=rnorm(8000)) + +right2 <- data.frame(key=rep(right$key, 2), + key2=rep(right$key2, 2), + value2=rnorm(16000)) + +left.dt <- data.table(left, key=c("key", "key2")) +right.dt <- data.table(right, key=c("key", "key2")) +right2.dt <- data.table(right2, key=c("key", "key2")) + +# left.dt2 <- data.table(left) +# right.dt2 <- data.table(right) + +## left <- data.frame(key=rep(indices[1:1000], 10), +## key2=rep(indices2[1:1000], 10), +## value=rnorm(100000)) +## right <- data.frame(key=indices[1:1000], +## key2=indices2[1:1000], +## value2=rnorm(10000)) + +timeit <- function(func, niter=10) { + timing = rep(NA, niter) + for (i in 1:niter) { + gc() + timing[i] <- system.time(func())[3] + } + mean(timing) +} + +left.join <- function(sort=FALSE) { + result <- base::merge(left, right, all.x=TRUE, sort=sort) +} + +right.join <- function(sort=FALSE) { + result <- base::merge(left, right, all.y=TRUE, sort=sort) +} + +outer.join <- function(sort=FALSE) { + result <- base::merge(left, right, all=TRUE, sort=sort) +} + +inner.join <- function(sort=FALSE) { + result <- base::merge(left, right, all=FALSE, sort=sort) +} + +left.join.dt <- function(sort=FALSE) { + result <- right.dt[left.dt] +} + +right.join.dt <- function(sort=FALSE) { + result <- left.dt[right.dt] +} + +outer.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right.dt, all=TRUE, sort=sort) +} + +inner.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right.dt, all=FALSE, sort=sort) +} + +plyr.join <- function(type) { + result <- plyr::join(left, right, by=c("key", "key2"), + type=type, match="first") +} + +sort.options <- c(FALSE, TRUE) + +# many-to-one + +results <- matrix(nrow=4, ncol=3) +colnames(results) <- c("base::merge", "plyr", "data.table") +rownames(results) <- c("inner", "outer", "left", "right") + +base.functions <- c(inner.join, outer.join, left.join, right.join) +plyr.functions <- c(function() plyr.join("inner"), + function() plyr.join("full"), + function() plyr.join("left"), + function() plyr.join("right")) +dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) +for (i in 1:4) { + base.func <- base.functions[[i]] + plyr.func <- plyr.functions[[i]] + dt.func <- dt.functions[[i]] + results[i, 1] <- timeit(base.func) + results[i, 2] <- timeit(plyr.func) + results[i, 3] <- timeit(dt.func) +} + + +# many-to-many + +left.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all.x=TRUE, sort=sort) +} + +right.join <- 
function(sort=FALSE) { + result <- base::merge(left, right2, all.y=TRUE, sort=sort) +} + +outer.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all=TRUE, sort=sort) +} + +inner.join <- function(sort=FALSE) { + result <- base::merge(left, right2, all=FALSE, sort=sort) +} + +left.join.dt <- function(sort=FALSE) { + result <- right2.dt[left.dt] +} + +right.join.dt <- function(sort=FALSE) { + result <- left.dt[right2.dt] +} + +outer.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right2.dt, all=TRUE, sort=sort) +} + +inner.join.dt <- function(sort=FALSE) { + result <- merge(left.dt, right2.dt, all=FALSE, sort=sort) +} + +sort.options <- c(FALSE, TRUE) + +# many-to-one + +results <- matrix(nrow=4, ncol=3) +colnames(results) <- c("base::merge", "plyr", "data.table") +rownames(results) <- c("inner", "outer", "left", "right") + +base.functions <- c(inner.join, outer.join, left.join, right.join) +plyr.functions <- c(function() plyr.join("inner"), + function() plyr.join("full"), + function() plyr.join("left"), + function() plyr.join("right")) +dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt) +for (i in 1:4) { + base.func <- base.functions[[i]] + plyr.func <- plyr.functions[[i]] + dt.func <- dt.functions[[i]] + results[i, 1] <- timeit(base.func) + results[i, 2] <- timeit(plyr.func) + results[i, 3] <- timeit(dt.func) +} + diff --git a/bench/bench_merge.py b/bench/bench_merge.py new file mode 100644 index 00000000..9dc21b78 --- /dev/null +++ b/bench/bench_merge.py @@ -0,0 +1,104 @@ +from pandas import * +from pandas.util.testing import rands +import random + +N = 10000 +ngroups = 10 + +def get_test_data(ngroups=100, n=N): + unique_groups = range(ngroups) + arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], + dtype=object) + + random.shuffle(arr) + return arr + +# aggregate multiple columns +# df = DataFrame({'key1' : get_test_data(ngroups=ngroups), +# 'key2' : get_test_data(ngroups=ngroups), +# 'data1' : np.random.randn(N), +# 'data2' : np.random.randn(N)}) + +# df2 = DataFrame({'key1' : get_test_data(ngroups=ngroups, n=N//10), +# 'key2' : get_test_data(ngroups=ngroups//2, n=N//10), +# 'value' : np.random.randn(N // 10)}) +# result = merge.merge(df, df2, on='key2') + +from collections import defaultdict +import gc +import time +from pandas.util.testing import rands +N = 10000 + +indices = np.array([rands(10) for _ in xrange(N)], dtype='O') +indices2 = np.array([rands(10) for _ in xrange(N)], dtype='O') +key = np.tile(indices[:8000], 10) +key2 = np.tile(indices2[:8000], 10) + +left = DataFrame({'key' : key, 'key2':key2, + 'value' : np.random.randn(80000)}) +right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:], + 'value2' : np.random.randn(8000)}) + +right2 = right.append(right, ignore_index=True) + + +join_methods = ['inner', 'outer', 'left', 'right'] +results = DataFrame(index=join_methods, columns=[False]) +niter = 10 +for sort in [False]: + for join_method in join_methods: + f = lambda: merge(left, right, how=join_method, sort=sort) + gc.disable() + start = time.time() + for _ in xrange(niter): + f() + elapsed = (time.time() - start) / niter + gc.enable() + results[sort][join_method] = elapsed +results.columns = ['pandas'] +# results.columns = ['dont_sort', 'sort'] + + +# R results +from StringIO import StringIO +# many to one +r_results = read_table(StringIO(""" base::merge plyr data.table +inner 0.2475 0.1183 0.1100 +outer 
0.4213 0.1916 0.2090 +left 0.2998 0.1188 0.0572 +right 0.3102 0.0536 0.0376 +"""), sep='\s+') + +all_results = results.join(r_results) + +all_results = all_results.div(all_results['pandas'], axis=0) + +all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', 'base::merge']] + +sort_results = DataFrame.from_items([('pandas', results['sort']), + ('R', r_results['sort'])]) +sort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] + + +nosort_results = DataFrame.from_items([('pandas', results['dont_sort']), + ('R', r_results['dont_sort'])]) +nosort_results['Ratio'] = sort_results['R'] / sort_results['pandas'] + +# many to many + +from StringIO import StringIO +# many to one +r_results = read_table(StringIO("""base::merge plyr data.table +inner 0.4610 0.1276 0.1269 +outer 0.9195 0.1881 0.2725 +left 0.6559 0.1257 0.0678 +right 0.6425 0.0522 0.0428 +"""), sep='\s+') + +all_results = results.join(r_results) +all_results = all_results.div(all_results['pandas'], axis=0) +all_results = all_results.ix[:, ['pandas', 'data.table', 'plyr', 'base::merge']] diff --git a/bench/bench_merge_sqlite.py b/bench/bench_merge_sqlite.py new file mode 100644 index 00000000..14a5288e --- /dev/null +++ b/bench/bench_merge_sqlite.py @@ -0,0 +1,84 @@ +import numpy as np +from collections import defaultdict +import gc +import time +from pandas import DataFrame +from pandas.util.testing import rands +import random + +N = 10000 + +indices = np.array([rands(10) for _ in xrange(N)], dtype='O') +indices2 = np.array([rands(10) for _ in xrange(N)], dtype='O') +key = np.tile(indices[:8000], 10) +key2 = np.tile(indices2[:8000], 10) + +left = DataFrame({'key' : key, 'key2':key2, + 'value' : np.random.randn(80000)}) +right = DataFrame({'key': indices[2000:], 'key2':indices2[2000:], + 'value2' : np.random.randn(8000)}) + +# right2 = right.append(right, ignore_index=True) +# right = right2 + +# random.shuffle(key2) +# indices2 = indices.copy() +# random.shuffle(indices2) + +# Prepare Database +import sqlite3 +create_sql_indexes = True + +conn = sqlite3.connect(':memory:') +conn.execute('create table left( key varchar(10), key2 varchar(10), value int);') +conn.execute('create table right( key varchar(10), key2 varchar(10), value2 int);') +conn.executemany('insert into left values (?, ?, ?)', + zip(key, key2, left['value'])) +conn.executemany('insert into right values (?, ?, ?)', + zip(right['key'], right['key2'], right['value2'])) + +# Create Indices +if create_sql_indexes: + conn.execute('create index left_ix on left(key, key2)') + conn.execute('create index right_ix on right(key, key2)') + + +join_methods = ['inner', 'left outer', 'left'] # others not supported +sql_results = DataFrame(index=join_methods, columns=[False]) +niter = 5 +for sort in [False]: + for join_method in join_methods: + sql = """CREATE TABLE test as select * + from left + %s join right + on left.key=right.key + and left.key2 = right.key2;""" % join_method + sql = """select * + from left + %s join right + on left.key=right.key + and left.key2 = right.key2;""" % join_method + + if sort: + sql = '%s order by key, key2' % sql + f = lambda: list(conn.execute(sql)) # list fetches results + g = lambda: conn.execute(sql) # list fetches results + gc.disable() + start = time.time() + # for _ in xrange(niter): + g() + elapsed = (time.time() - start) / niter + gc.enable() + + cur = conn.execute("DROP TABLE test") + conn.commit() + + sql_results[sort][join_method] = elapsed +sql_results.columns = ['sqlite3'] # ['dont_sort', 'sort'] +sql_results.index = 
['inner', 'outer', 'left'] + + sql = """select * + from left + inner join right + on left.key=right.key + and left.key2 = right.key2;""" diff --git a/bench/bench_pivot.R b/bench/bench_pivot.R new file mode 100644 index 00000000..06dc6a10 --- /dev/null +++ b/bench/bench_pivot.R @@ -0,0 +1,27 @@ +library(reshape2) + + +n <- 100000 +a.size <- 5 +b.size <- 5 + +data <- data.frame(a=sample(letters[1:a.size], n, replace=T), + b=sample(letters[1:b.size], n, replace=T), + c=rnorm(n), + d=rnorm(n)) + +timings <- numeric() + +# acast(melt(data, id=c("a", "b")), a ~ b, mean) +# acast(melt(data, id=c("a", "b")), a + b ~ variable, mean) + +for (i in 1:10) { + gc() + tim <- system.time(acast(melt(data, id=c("a", "b")), a ~ b, mean, + subset=.(variable=="c"))) + timings[i] = tim[3] +} + +mean(timings) + +acast(melt(data, id=c("a", "b")), a ~ b, mean, subset=.(variable="c")) diff --git a/bench/bench_pivot.py b/bench/bench_pivot.py new file mode 100644 index 00000000..007bd0aa --- /dev/null +++ b/bench/bench_pivot.py @@ -0,0 +1,16 @@ +from pandas import * +import string + + +n = 100000 +asize = 5 +bsize = 5 + +letters = np.asarray(list(string.letters), dtype=object) + +data = DataFrame(dict(foo=letters[:asize][np.random.randint(0, asize, n)], + bar=letters[:bsize][np.random.randint(0, bsize, n)], + baz=np.random.randn(n), + qux=np.random.randn(n))) + +table = pivot_table(data, xby=['foo', 'bar']) diff --git a/bench/bench_sparse.py b/bench/bench_sparse.py new file mode 100644 index 00000000..40034152 --- /dev/null +++ b/bench/bench_sparse.py @@ -0,0 +1,92 @@ +import sys +import numpy as np + +from pandas import * +import pandas.core.sparse as spm +reload(spm) +from pandas.core.sparse import * + +N = 10000. + +arr1 = np.arange(N) +index = Index(np.arange(N)) + +off = N//10 +arr1[off : 2 * off] = np.NaN +arr1[4*off: 5 * off] = np.NaN +arr1[8*off: 9 * off] = np.NaN + +arr2 = np.arange(N) +arr2[3 * off // 2: 2 * off + off // 2] = np.NaN +arr2[8 * off + off // 2: 9 * off + off // 2] = np.NaN + +s1 = SparseSeries(arr1, index=index) +s2 = SparseSeries(arr2, index=index) + +is1 = SparseSeries(arr1, kind='integer', index=index) +is2 = SparseSeries(arr2, kind='integer', index=index) + +s1_dense = s1.to_dense() +s2_dense = s2.to_dense() + +if 'linux' in sys.platform: + pth = '/home/wesm/code/pandas/example' +else: + pth = '/Users/wesm/code/pandas/example' + +dm = DataFrame.load(pth) + +sdf = dm.to_sparse() + +def new_data_like(sdf): + new_data = {} + for col, series in sdf.iteritems(): + new_data[col] = SparseSeries(np.random.randn(len(series.sp_values)), + index=sdf.index, + sparse_index=series.sp_index, + fill_value=series.fill_value) + + return SparseDataFrame(new_data) + +# data = {} +# for col, ser in dm.iteritems(): +# data[col] = SparseSeries(ser) + +dwp = Panel.fromDict({'foo' : dm}) +# sdf = SparseDataFrame(data) + + +lp = stack_sparse_frame(sdf) + + +swp = SparsePanel({'A' : sdf}) +swp = SparsePanel({'A' : sdf, + 'B' : sdf, + 'C' : sdf, + 'D' : sdf}) + +y = sdf +x = SparsePanel({'x1' : sdf + new_data_like(sdf) / 10, + 'x2' : sdf + new_data_like(sdf) / 10}) + +dense_y = sdf +dense_x = x.to_dense() + +# import hotshot, hotshot.stats +# prof = hotshot.Profile('test.prof') + +# benchtime, stones = prof.runcall(ols, y=y, x=x) + +# prof.close() + +# stats = hotshot.stats.load('test.prof') + +dense_model = ols(y=dense_y, x=dense_x) + +import pandas.stats.plm as plm +import pandas.stats.interface as face +reload(plm) +reload(face) + +# model = face.ols(y=y, x=x) + diff --git a/bench/bench_take_indexing.py 
b/bench/bench_take_indexing.py new file mode 100644 index 00000000..fc8a3c6b --- /dev/null +++ b/bench/bench_take_indexing.py @@ -0,0 +1,52 @@ +import numpy as np + +from pandas import * +import pandas._tseries as lib + +from pandas import DataFrame +import timeit + +setup = """ +from pandas import Series +import pandas._tseries as lib +import random +import numpy as np + +import random +n = %d +k = %d +arr = np.random.randn(n, k) +indexer = np.arange(n, dtype=np.int32) +indexer = indexer[::-1] +""" + +sizes = [100, 1000, 10000, 100000] +iters = [1000, 1000, 100, 1] + +fancy_2d = [] +take_2d = [] +cython_2d = [] + +n = 1000 + +def _timeit(stmt, size, k=5, iters=1000): + timer = timeit.Timer(stmt=stmt, setup=setup % (sz, k)) + return timer.timeit(n) / n + +for sz, its in zip(sizes, iters): + print sz + fancy_2d.append(_timeit('arr[indexer]', sz, iters=its)) + take_2d.append(_timeit('arr.take(indexer, axis=0)', sz, iters=its)) + cython_2d.append(_timeit('lib.take_axis0(arr, indexer)', sz, iters=its)) + +df = DataFrame({'fancy' : fancy_2d, + 'take' : take_2d, + 'cython' : cython_2d}) + +print df + +from pandas.rpy.common import r +r('mat <- matrix(rnorm(50000), nrow=10000, ncol=5)') +r('set.seed(12345') +r('indexer <- sample(1:10000)') +r('mat[indexer,]') diff --git a/bench/bench_unique.py b/bench/bench_unique.py new file mode 100644 index 00000000..3b5ece66 --- /dev/null +++ b/bench/bench_unique.py @@ -0,0 +1,264 @@ +from pandas import * +from pandas.util.testing import rands +import pandas._tseries as lib +import numpy as np +import matplotlib.pyplot as plt + +N = 50000 +K = 10000 + +groups = np.array([rands(10) for _ in xrange(K)], dtype='O') +groups2 = np.array([rands(10) for _ in xrange(K)], dtype='O') + +labels = np.tile(groups, N // K) +labels2 = np.tile(groups2, N // K) +data = np.random.randn(N) + +def timeit(f, niter): + import gc, time + gc.disable() + start = time.time() + for _ in xrange(niter): + f() + elapsed = (time.time() - start) / niter + gc.enable() + return elapsed + +def algo1(): + unique_labels = np.unique(labels) + result = np.empty(len(unique_labels)) + for i, label in enumerate(unique_labels): + result[i] = data[labels == label].sum() + +def algo2(): + unique_labels = np.unique(labels) + indices = lib.groupby_indices(labels) + result = np.empty(len(unique_labels)) + + for i, label in enumerate(unique_labels): + result[i] = data.take(indices[label]).sum() + +def algo3_nosort(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(labels, sort=False) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + +def algo3_sort(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(labels, sort=True) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + +import numpy as np +import random + + +# dict to hold results +counts = {} + +# a hack to generate random key, value pairs. 
+# 5k keys, 100k values +x = np.tile(np.arange(5000, dtype='O'), 20) +random.shuffle(x) +xarr = x +x = [int(y) for y in x] +data = np.random.uniform(0, 1, 100000) + +def f(): + from itertools import izip + # groupby sum + for k, v in izip(x, data): + try: + counts[k] += v + except KeyError: + counts[k] = v + +def f2(): + rizer = lib.DictFactorizer() + labs, counts = rizer.factorize(xarr, sort=False) + k = len(rizer.uniques) + out = np.empty(k) + lib.group_add(out, counts, data, labs) + +def algo4(): + rizer = lib.DictFactorizer() + labs1, _ = rizer.factorize(labels, sort=False) + k1 = len(rizer.uniques) + + rizer = lib.DictFactorizer() + labs2, _ = rizer.factorize(labels2, sort=False) + k2 = len(rizer.uniques) + + group_id = labs1 * k2 + labs2 + max_group = k1 * k2 + + if max_group > 1e6: + rizer = lib.Int64Factorizer(len(group_id)) + group_id, _ = rizer.factorize(group_id.astype('i8'), sort=True) + max_group = len(rizer.uniques) + + out = np.empty(max_group) + counts = np.zeros(max_group, dtype='i4') + lib.group_add(out, counts, data, group_id) + +# cumtime percall filename:lineno(function) +# 0.592 0.592 :1() + # 0.584 0.006 groupby_ex.py:37(algo3_nosort) + # 0.535 0.005 {method 'factorize' of DictFactorizer' objects} + # 0.047 0.000 {pandas._tseries.group_add} + # 0.002 0.000 numeric.py:65(zeros_like) + # 0.001 0.000 {method 'fill' of 'numpy.ndarray' objects} + # 0.000 0.000 {numpy.core.multiarray.empty_like} + # 0.000 0.000 {numpy.core.multiarray.empty} + +# UNIQUE timings + +# N = 10000000 +# K = 500000 + +# groups = np.array([rands(10) for _ in xrange(K)], dtype='O') + +# labels = np.tile(groups, N // K) +data = np.random.randn(N) + +data = np.random.randn(N) + +Ks = [100, 1000, 5000, 10000, 25000, 50000, 100000] + +# Ks = [500000, 1000000, 2500000, 5000000, 10000000] + +import psutil +import os +import gc + +pid = os.getpid() +proc = psutil.Process(pid) + +def dict_unique(values, expected_K, sort=False, memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + rizer = lib.DictFactorizer() + result = rizer.unique_int64(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + return result + +def khash_unique(values, expected_K, size_hint=False, sort=False, + memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + if size_hint: + rizer = lib.Factorizer(len(values)) + else: + rizer = lib.Factorizer(100) + + result = [] + result = rizer.unique(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + +def khash_unique_str(values, expected_K, size_hint=False, sort=False, + memory=False): + if memory: + gc.collect() + before_mem = proc.get_memory_info().rss + + if size_hint: + rizer = lib.StringHashTable(len(values)) + else: + rizer = lib.StringHashTable(100) + + result = [] + result = rizer.unique(values) + + if memory: + result = proc.get_memory_info().rss - before_mem + return result + + if sort: + result.sort() + assert(len(result) == expected_K) + +def khash_unique_int64(values, expected_K, size_hint=False, sort=False): + if size_hint: + rizer = lib.Int64HashTable(len(values)) + else: + rizer = lib.Int64HashTable(100) + + result = [] + result = rizer.unique(values) + + if sort: + result.sort() + assert(len(result) == expected_K) + +def hash_bench(): + numpy = [] + dict_based = [] + dict_based_sort = [] + khash_hint = [] + 
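+    # Per-K mean timings collected in the loop below: dict-based unique vs.
+    # khash-based unique with and without a size hint (the numpy and
+    # sorted-dict variants are only exercised by the commented-out code).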
khash_nohint = [] + for K in Ks: + print K + # groups = np.array([rands(10) for _ in xrange(K)]) + # labels = np.tile(groups, N // K).astype('O') + + groups = np.random.randint(0, 100000000000L, size=K) + labels = np.tile(groups, N // K) + dict_based.append(timeit(lambda: dict_unique(labels, K), 20)) + khash_nohint.append(timeit(lambda: khash_unique_int64(labels, K), 20)) + khash_hint.append(timeit(lambda: khash_unique_int64(labels, K, + size_hint=True), 20)) + + # memory, hard to get + # dict_based.append(np.mean([dict_unique(labels, K, memory=True) + # for _ in xrange(10)])) + # khash_nohint.append(np.mean([khash_unique(labels, K, memory=True) + # for _ in xrange(10)])) + # khash_hint.append(np.mean([khash_unique(labels, K, size_hint=True, memory=True) + # for _ in xrange(10)])) + + # dict_based_sort.append(timeit(lambda: dict_unique(labels, K, + # sort=True), 10)) + # numpy.append(timeit(lambda: np.unique(labels), 10)) + + # unique_timings = DataFrame({'numpy.unique' : numpy, + # 'dict, no sort' : dict_based, + # 'dict, sort' : dict_based_sort}, + # columns=['dict, no sort', + # 'dict, sort', 'numpy.unique'], + # index=Ks) + + unique_timings = DataFrame({'dict' : dict_based, + 'khash, preallocate' : khash_hint, + 'khash' : khash_nohint}, + columns=['khash, preallocate', 'khash', 'dict'], + index=Ks) + + unique_timings.plot(kind='bar', legend=False) + plt.legend(loc='best') + plt.title('Unique on 100,000 values, int64') + plt.xlabel('Number of unique labels') + plt.ylabel('Mean execution time') + + + plt.show() diff --git a/bench/better_unique.py b/bench/better_unique.py new file mode 100644 index 00000000..9ff4823c --- /dev/null +++ b/bench/better_unique.py @@ -0,0 +1,76 @@ +from pandas import DataFrame +import timeit + +setup = """ +from pandas import Series +import pandas._tseries as _tseries +import random +import numpy as np + +def better_unique(values): + uniques = _tseries.fast_unique(values) + id_map = _tseries.map_indices_buf(uniques) + labels = _tseries.get_unique_labels(values, id_map) + return uniques, labels + +tot = 100000 + +def get_test_data(ngroups=100, n=tot): + unique_groups = range(ngroups) + random.shuffle(unique_groups) + arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], + dtype=object) + + return arr + +arr = get_test_data(ngroups=%d) +""" + +group_sizes = [10, 100, 1000, 10000, + 20000, 30000, 40000, + 50000, 60000, 70000, + 80000, 90000, 100000] + +numbers = [100, 100, 50] + [10] * 10 + +numpy = [] +wes = [] + +for sz, n in zip(group_sizes, numbers): + # wes_timer = timeit.Timer(stmt='better_unique(arr)', + # setup=setup % sz) + wes_timer = timeit.Timer(stmt='_tseries.fast_unique(arr)', + setup=setup % sz) + + numpy_timer = timeit.Timer(stmt='np.unique(arr)', + setup=setup % sz) + + print n + numpy_result = numpy_timer.timeit(number=n) / n + wes_result = wes_timer.timeit(number=n) / n + + print 'Groups: %d, NumPy: %s, Wes: %s' % (sz, numpy_result, wes_result) + + wes.append(wes_result) + numpy.append(numpy_result) + +result = DataFrame({'wes' : wes, 'numpy' : numpy}, index=group_sizes) + +def make_plot(numpy, wes): + pass + +# def get_test_data(ngroups=100, n=100000): +# unique_groups = range(ngroups) +# random.shuffle(unique_groups) +# arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + +# if len(arr) < n: +# arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], +# dtype=object) + +# return arr + +# arr = get_test_data(ngroups=1000) diff 
--git a/bench/duplicated.R b/bench/duplicated.R new file mode 100644 index 00000000..eb2376df --- /dev/null +++ b/bench/duplicated.R @@ -0,0 +1,22 @@ +N <- 100000 + +k1 = rep(NA, N) +k2 = rep(NA, N) +for (i in 1:N){ + k1[i] <- paste(sample(letters, 1), collapse="") + k2[i] <- paste(sample(letters, 1), collapse="") +} +df <- data.frame(a=k1, b=k2, c=rep(1:100, N / 100)) +df2 <- data.frame(a=k1, b=k2) + +timings <- numeric() +timings2 <- numeric() +for (i in 1:50) { + gc() + timings[i] = system.time(deduped <- df[!duplicated(df),])[3] + gc() + timings2[i] = system.time(deduped <- df[!duplicated(df[,c("a", "b")]),])[3] +} + +mean(timings) +mean(timings2) diff --git a/bench/io_roundtrip.py b/bench/io_roundtrip.py new file mode 100644 index 00000000..6b86d2a6 --- /dev/null +++ b/bench/io_roundtrip.py @@ -0,0 +1,107 @@ +import time, os +import numpy as np + +import la +import pandas +from pandas import datetools, DateRange + +def timeit(f, iterations): + start = time.clock() + + for i in xrange(iterations): + f() + + return time.clock() - start + +def rountrip_archive(N, K=50, iterations=10): + # Create data + arr = np.random.randn(N, K) + # lar = la.larry(arr) + dma = pandas.DataFrame(arr, + DateRange('1/1/2000', periods=N, + offset=datetools.Minute())) + dma[201] = 'bar' + + # filenames + filename_numpy = '/Users/wesm/tmp/numpy.npz' + filename_larry = '/Users/wesm/tmp/archive.hdf5' + filename_pandas = '/Users/wesm/tmp/pandas_tmp' + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + + try: + os.unlink(filename_pandas) + except: + pass + + # Time a round trip save and load + # numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) + # numpy_time = timeit(numpy_f, iterations) / iterations + + # larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) + # larry_time = timeit(larry_f, iterations) / iterations + + pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pandas_time = timeit(pandas_f, iterations) / iterations + print 'pandas (HDF5) %7.4f seconds' % pandas_time + + pickle_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pickle_time = timeit(pickle_f, iterations) / iterations + print 'pandas (pickle) %7.4f seconds' % pickle_time + + # print 'Numpy (npz) %7.4f seconds' % numpy_time + # print 'larry (HDF5) %7.4f seconds' % larry_time + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + + try: + os.unlink(filename_pandas) + except: + pass + +def numpy_roundtrip(filename, arr1, arr2): + np.savez(filename, arr1=arr1, arr2=arr2) + npz = np.load(filename) + arr1 = npz['arr1'] + arr2 = npz['arr2'] + +def larry_roundtrip(filename, lar1, lar2): + io = la.IO(filename) + io['lar1'] = lar1 + io['lar2'] = lar2 + lar1 = io['lar1'] + lar2 = io['lar2'] + +def pandas_roundtrip(filename, dma1, dma2): + # What's the best way to code this? 
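+    # HDFStore behaves like a dict persisted to an HDF5 file: assigning a key
+    # writes the object out and indexing reads it back, so the round trip
+    # below writes both frames and then loads them again.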
+ from pandas.io.pytables import HDFStore + store = HDFStore(filename) + store['dma1'] = dma1 + store['dma2'] = dma2 + dma1 = store['dma1'] + dma2 = store['dma2'] + +def pandas_roundtrip_pickle(filename, dma1, dma2): + dma1.save(filename) + dma1 = pandas.DataFrame.load(filename) + dma2.save(filename) + dma2 = pandas.DataFrame.load(filename) + +if __name__ == '__main__': + rountrip_archive(10000, K=200) diff --git a/bench/larry.py b/bench/larry.py new file mode 100644 index 00000000..e69de29b diff --git a/bench/serialize.py b/bench/serialize.py new file mode 100644 index 00000000..29eecfc4 --- /dev/null +++ b/bench/serialize.py @@ -0,0 +1,80 @@ +import time, os +import numpy as np + +import la +import pandas + +def timeit(f, iterations): + start = time.clock() + + for i in xrange(iterations): + f() + + return time.clock() - start + +def roundtrip_archive(N, iterations=10): + + # Create data + arr = np.random.randn(N, N) + lar = la.larry(arr) + dma = pandas.DataFrame(arr, range(N), range(N)) + + # filenames + filename_numpy = '/Users/wesm/tmp/numpy.npz' + filename_larry = '/Users/wesm/tmp/archive.hdf5' + filename_pandas = '/Users/wesm/tmp/pandas_tmp' + + # Delete old files + try: + os.unlink(filename_numpy) + except: + pass + try: + os.unlink(filename_larry) + except: + pass + try: + os.unlink(filename_pandas) + except: + pass + + # Time a round trip save and load + numpy_f = lambda: numpy_roundtrip(filename_numpy, arr, arr) + numpy_time = timeit(numpy_f, iterations) / iterations + + larry_f = lambda: larry_roundtrip(filename_larry, lar, lar) + larry_time = timeit(larry_f, iterations) / iterations + + pandas_f = lambda: pandas_roundtrip(filename_pandas, dma, dma) + pandas_time = timeit(pandas_f, iterations) / iterations + + print 'Numpy (npz) %7.4f seconds' % numpy_time + print 'larry (HDF5) %7.4f seconds' % larry_time + print 'pandas (HDF5) %7.4f seconds' % pandas_time + +def numpy_roundtrip(filename, arr1, arr2): + np.savez(filename, arr1=arr1, arr2=arr2) + npz = np.load(filename) + arr1 = npz['arr1'] + arr2 = npz['arr2'] + +def larry_roundtrip(filename, lar1, lar2): + io = la.IO(filename) + io['lar1'] = lar1 + io['lar2'] = lar2 + lar1 = io['lar1'] + lar2 = io['lar2'] + +def pandas_roundtrip(filename, dma1, dma2): + from pandas.io.pytables import HDFStore + store = HDFStore(filename) + store['dma1'] = dma1 + store['dma2'] = dma2 + dma1 = store['dma1'] + dma2 = store['dma2'] + +def pandas_roundtrip_pickle(filename, dma1, dma2): + dma1.save(filename) + dma1 = pandas.DataFrame.load(filename) + dma2.save(filename) + dma2 = pandas.DataFrame.load(filename) diff --git a/bench/test.py b/bench/test.py new file mode 100644 index 00000000..7fdf94fd --- /dev/null +++ b/bench/test.py @@ -0,0 +1,65 @@ +import numpy as np +import itertools +import collections +import scipy.ndimage as ndi + +N = 10000 + +lat = np.random.randint(0, 360, N) +lon = np.random.randint(0, 360, N) +data = np.random.randn(N) + +def groupby1(lat, lon, data): + indexer = np.lexsort((lon, lat)) + lat = lat.take(indexer) + lon = lon.take(indexer) + sorted_data = data.take(indexer) + + keys = 1000. 
* lat + lon + unique_keys = np.unique(keys) + bounds = keys.searchsorted(unique_keys) + + result = group_agg(sorted_data, bounds, lambda x: x.mean()) + + decoder = keys.searchsorted(unique_keys) + + return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) + +def group_mean(lat, lon, data): + indexer = np.lexsort((lon, lat)) + lat = lat.take(indexer) + lon = lon.take(indexer) + sorted_data = data.take(indexer) + + keys = 1000 * lat + lon + unique_keys = np.unique(keys) + + result = ndi.mean(sorted_data, labels=keys, index=unique_keys) + decoder = keys.searchsorted(unique_keys) + + return dict(zip(zip(lat.take(decoder), lon.take(decoder)), result)) + +def group_mean_naive(lat, lon, data): + grouped = collections.defaultdict(list) + for lt, ln, da in zip(lat, lon, data): + grouped[(lt, ln)].append(da) + + averaged = dict((ltln, np.mean(da)) for ltln, da in grouped.items()) + + return averaged + +def group_agg(values, bounds, f): + N = len(values) + result = np.empty(len(bounds), dtype=float) + for i, left_bound in enumerate(bounds): + if i == len(bounds) - 1: + right_bound = N + else: + right_bound = bounds[i + 1] + + result[i] = f(values[left_bound : right_bound]) + + return result + +# for i in range(10): +# groupby1(lat, lon, data) diff --git a/bench/zoo_bench.R b/bench/zoo_bench.R new file mode 100644 index 00000000..294d55f5 --- /dev/null +++ b/bench/zoo_bench.R @@ -0,0 +1,71 @@ +library(zoo) +library(xts) +library(fts) +library(tseries) +library(its) +library(xtable) + +## indices = rep(NA, 100000) +## for (i in 1:100000) +## indices[i] <- paste(sample(letters, 10), collapse="") + + + +## x <- zoo(rnorm(100000), indices) +## y <- zoo(rnorm(90000), indices[sample(1:100000, 90000)]) + +## indices <- as.POSIXct(1:100000) + +indices <- as.POSIXct(Sys.Date()) + seq(1, 100000000, 100) + +sz <- 500000 + +## x <- xts(rnorm(sz), sample(indices, sz)) +## y <- xts(rnorm(sz), sample(indices, sz)) + +zoo.bench <- function(){ + x <- zoo(rnorm(sz), sample(indices, sz)) + y <- zoo(rnorm(sz), sample(indices, sz)) + timeit(function() {x + y}) +} + +xts.bench <- function(){ + x <- xts(rnorm(sz), sample(indices, sz)) + y <- xts(rnorm(sz), sample(indices, sz)) + timeit(function() {x + y}) +} + +fts.bench <- function(){ + x <- fts(rnorm(sz), sort(sample(indices, sz))) + y <- fts(rnorm(sz), sort(sample(indices, sz)) + timeit(function() {x + y}) +} + +its.bench <- function(){ + x <- its(rnorm(sz), sort(sample(indices, sz))) + y <- its(rnorm(sz), sort(sample(indices, sz))) + timeit(function() {x + y}) +} + +irts.bench <- function(){ + x <- irts(sort(sample(indices, sz)), rnorm(sz)) + y <- irts(sort(sample(indices, sz)), rnorm(sz)) + timeit(function() {x + y}) +} + +timeit <- function(f){ + timings <- numeric() + for (i in 1:10) { + gc() + timings[i] = system.time(f())[3] + } + mean(timings) +} + +bench <- function(){ + results <- c(xts.bench(), fts.bench(), its.bench(), zoo.bench()) + names <- c("xts", "fts", "its", "zoo") + data.frame(results, names) +} + +result <- bench() diff --git a/bench/zoo_bench.py b/bench/zoo_bench.py new file mode 100644 index 00000000..450d659c --- /dev/null +++ b/bench/zoo_bench.py @@ -0,0 +1,35 @@ +from pandas import * +from pandas.util.testing import rands + +n = 1000000 +# indices = Index([rands(10) for _ in xrange(n)]) +def sample(values, k): + sampler = np.random.permutation(len(values)) + return values.take(sampler[:k]) +sz = 500000 +rng = np.arange(0, 10000000000000, 10000000) +stamps = np.datetime64(datetime.now()).view('i8') + rng +idx1 = np.sort(sample(stamps, 
sz)) +idx2 = np.sort(sample(stamps, sz)) +ts1 = Series(np.random.randn(sz), idx1) +ts2 = Series(np.random.randn(sz), idx2) + + +# subsample_size = 90000 + +# x = Series(np.random.randn(100000), indices) +# y = Series(np.random.randn(subsample_size), +# index=sample(indices, subsample_size)) + + +# lx = larry(np.random.randn(100000), [list(indices)]) +# ly = larry(np.random.randn(subsample_size), [list(y.index)]) + +# Benchmark 1: Two 1-million length time series (int64-based index) with +# randomly chosen timestamps + +# Benchmark 2: Join two 5-variate time series DataFrames (outer and inner join) + +# df1 = DataFrame(np.random.randn(1000000, 5), idx1, columns=range(5)) +# df2 = DataFrame(np.random.randn(1000000, 5), idx2, columns=range(5, 10)) + diff --git a/doc/data/baseball.csv b/doc/data/baseball.csv new file mode 100644 index 00000000..546c3ad6 --- /dev/null +++ b/doc/data/baseball.csv @@ -0,0 +1,101 @@ +id,year,stint,team,lg,g,ab,r,h,X2b,X3b,hr,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp +88641,womacto01,2006,2,CHN,NL,19,50,6,14,1,0,1,2.0,1.0,1.0,4,4.0,0.0,0.0,3.0,0.0,0.0 +88643,schilcu01,2006,1,BOS,AL,31,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +88645,myersmi01,2006,1,NYA,AL,62,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +88649,helliri01,2006,1,MIL,NL,20,3,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 +88650,johnsra05,2006,1,NYA,AL,33,6,0,1,0,0,0,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,0.0 +88652,finlest01,2006,1,SFN,NL,139,426,66,105,21,12,6,40.0,7.0,0.0,46,55.0,2.0,2.0,3.0,4.0,6.0 +88653,gonzalu01,2006,1,ARI,NL,153,586,93,159,52,2,15,73.0,0.0,1.0,69,58.0,10.0,7.0,0.0,6.0,14.0 +88662,seleaa01,2006,1,LAN,NL,28,26,2,5,1,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,6.0,0.0,1.0 +89177,francju01,2007,2,ATL,NL,15,40,1,10,3,0,0,8.0,0.0,0.0,4,10.0,1.0,0.0,0.0,1.0,1.0 +89178,francju01,2007,1,NYN,NL,40,50,7,10,0,0,1,8.0,2.0,1.0,10,13.0,0.0,0.0,0.0,1.0,1.0 +89330,zaungr01,2007,1,TOR,AL,110,331,43,80,24,1,10,52.0,0.0,0.0,51,55.0,8.0,2.0,1.0,6.0,9.0 +89333,witasja01,2007,1,TBA,AL,3,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89334,williwo02,2007,1,HOU,NL,33,59,3,6,0,0,1,2.0,0.0,0.0,0,25.0,0.0,0.0,5.0,0.0,1.0 +89335,wickmbo01,2007,2,ARI,NL,8,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89336,wickmbo01,2007,1,ATL,NL,47,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89337,whitero02,2007,1,MIN,AL,38,109,8,19,4,0,4,20.0,0.0,0.0,6,19.0,0.0,3.0,0.0,1.0,2.0 +89338,whiteri01,2007,1,HOU,NL,20,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89339,wellsda01,2007,2,LAN,NL,7,15,2,4,1,0,0,1.0,0.0,0.0,0,6.0,0.0,0.0,0.0,0.0,0.0 +89340,wellsda01,2007,1,SDN,NL,22,38,1,4,0,0,0,0.0,0.0,0.0,0,12.0,0.0,0.0,4.0,0.0,0.0 +89341,weathda01,2007,1,CIN,NL,67,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89343,walketo04,2007,1,OAK,AL,18,48,5,13,1,0,0,4.0,0.0,0.0,2,4.0,0.0,0.0,0.0,2.0,2.0 +89345,wakefti01,2007,1,BOS,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,2.0,0.0,0.0,0.0,0.0,0.0 +89347,vizquom01,2007,1,SFN,NL,145,513,54,126,18,3,4,51.0,14.0,6.0,44,48.0,6.0,1.0,14.0,3.0,14.0 +89348,villoro01,2007,1,NYA,AL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89352,valenjo03,2007,1,NYN,NL,51,166,18,40,11,1,3,18.0,2.0,1.0,15,28.0,4.0,0.0,1.0,1.0,5.0 +89354,trachst01,2007,2,CHN,NL,4,7,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89355,trachst01,2007,1,BAL,AL,3,5,0,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 +89359,timlimi01,2007,1,BOS,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89360,thomeji01,2007,1,CHA,AL,130,432,79,119,19,0,35,96.0,0.0,1.0,95,134.0,11.0,6.0,0.0,3.0,10.0 
+89361,thomafr04,2007,1,TOR,AL,155,531,63,147,30,0,26,95.0,0.0,0.0,81,94.0,3.0,7.0,0.0,5.0,14.0 +89363,tavarju01,2007,1,BOS,AL,2,4,0,1,0,0,0,0.0,0.0,0.0,1,3.0,0.0,0.0,0.0,0.0,0.0 +89365,sweenma01,2007,2,LAN,NL,30,33,2,9,1,0,0,3.0,0.0,0.0,1,11.0,0.0,0.0,0.0,0.0,0.0 +89366,sweenma01,2007,1,SFN,NL,76,90,18,23,8,0,2,10.0,2.0,0.0,13,18.0,0.0,3.0,1.0,0.0,0.0 +89367,suppaje01,2007,1,MIL,NL,33,61,4,8,0,0,0,2.0,0.0,0.0,3,16.0,0.0,0.0,11.0,0.0,2.0 +89368,stinnke01,2007,1,SLN,NL,26,82,7,13,3,0,1,5.0,0.0,0.0,5,22.0,2.0,0.0,0.0,0.0,2.0 +89370,stantmi02,2007,1,CIN,NL,67,2,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89371,stairma01,2007,1,TOR,AL,125,357,58,103,28,1,21,64.0,2.0,1.0,44,66.0,5.0,2.0,0.0,2.0,7.0 +89372,sprinru01,2007,1,SLN,NL,72,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89374,sosasa01,2007,1,TEX,AL,114,412,53,104,24,1,21,92.0,0.0,0.0,34,112.0,3.0,3.0,0.0,5.0,11.0 +89375,smoltjo01,2007,1,ATL,NL,30,54,1,5,1,0,0,2.0,0.0,0.0,1,19.0,0.0,0.0,13.0,0.0,0.0 +89378,sheffga01,2007,1,DET,AL,133,494,107,131,20,1,25,75.0,22.0,5.0,84,71.0,2.0,9.0,0.0,6.0,10.0 +89381,seleaa01,2007,1,NYN,NL,31,4,0,0,0,0,0,0.0,0.0,0.0,1,1.0,0.0,0.0,1.0,0.0,0.0 +89382,seaneru01,2007,1,LAN,NL,68,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89383,schmija01,2007,1,LAN,NL,6,7,1,1,0,0,1,1.0,0.0,0.0,0,4.0,0.0,0.0,1.0,0.0,0.0 +89384,schilcu01,2007,1,BOS,AL,1,2,0,1,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89385,sandere02,2007,1,KCA,AL,24,73,12,23,7,0,2,11.0,0.0,1.0,11,15.0,0.0,1.0,0.0,0.0,2.0 +89388,rogerke01,2007,1,DET,AL,1,2,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89389,rodriiv01,2007,1,DET,AL,129,502,50,141,31,3,11,63.0,2.0,2.0,9,96.0,1.0,1.0,1.0,2.0,16.0 +89396,ramirma02,2007,1,BOS,AL,133,483,84,143,33,1,20,88.0,0.0,0.0,71,92.0,13.0,7.0,0.0,8.0,21.0 +89398,piazzmi01,2007,1,OAK,AL,83,309,33,85,17,1,8,44.0,0.0,0.0,18,61.0,0.0,0.0,0.0,2.0,9.0 +89400,perezne01,2007,1,DET,AL,33,64,5,11,3,0,1,6.0,0.0,0.0,4,8.0,0.0,0.0,3.0,0.0,2.0 +89402,parkch01,2007,1,NYN,NL,1,1,0,0,0,0,0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0 +89406,oliveda02,2007,1,LAA,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89410,myersmi01,2007,1,NYA,AL,6,1,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89411,mussimi01,2007,1,NYA,AL,2,2,0,0,0,0,0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0 +89412,moyerja01,2007,1,PHI,NL,33,73,4,9,2,0,0,2.0,0.0,0.0,2,26.0,0.0,0.0,8.0,0.0,1.0 +89420,mesajo01,2007,1,PHI,NL,38,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89421,martipe02,2007,1,NYN,NL,5,9,1,1,1,0,0,0.0,0.0,0.0,0,6.0,0.0,0.0,2.0,0.0,0.0 +89425,maddugr01,2007,1,SDN,NL,33,62,2,9,2,0,0,0.0,1.0,0.0,1,19.0,0.0,0.0,9.0,0.0,2.0 +89426,mabryjo01,2007,1,COL,NL,28,34,4,4,1,0,1,5.0,0.0,0.0,5,10.0,0.0,0.0,0.0,0.0,1.0 +89429,loftoke01,2007,2,CLE,AL,52,173,24,49,9,3,0,15.0,2.0,3.0,17,23.0,0.0,0.0,4.0,2.0,1.0 +89430,loftoke01,2007,1,TEX,AL,84,317,62,96,16,3,7,23.0,21.0,4.0,39,28.0,1.0,2.0,2.0,3.0,5.0 +89431,loaizes01,2007,1,LAN,NL,5,7,0,1,0,0,0,2.0,0.0,0.0,0,2.0,0.0,0.0,2.0,0.0,1.0 +89438,kleskry01,2007,1,SFN,NL,116,362,51,94,27,3,6,44.0,5.0,1.0,46,68.0,2.0,1.0,1.0,1.0,14.0 +89439,kentje01,2007,1,LAN,NL,136,494,78,149,36,1,20,79.0,1.0,3.0,57,61.0,4.0,5.0,0.0,6.0,17.0 +89442,jonesto02,2007,1,DET,AL,5,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89445,johnsra05,2007,1,ARI,NL,10,15,0,1,0,0,0,0.0,0.0,0.0,1,7.0,0.0,0.0,2.0,0.0,0.0 +89450,hoffmtr01,2007,1,SDN,NL,60,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89451,hernaro01,2007,2,LAN,NL,22,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 
+89452,hernaro01,2007,1,CLE,AL,2,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89460,guarded01,2007,1,CIN,NL,15,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89462,griffke02,2007,1,CIN,NL,144,528,78,146,24,1,30,93.0,6.0,1.0,85,99.0,14.0,1.0,0.0,9.0,14.0 +89463,greensh01,2007,1,NYN,NL,130,446,62,130,30,1,10,46.0,11.0,1.0,37,62.0,4.0,5.0,1.0,1.0,14.0 +89464,graffto01,2007,1,MIL,NL,86,231,34,55,8,0,9,30.0,0.0,1.0,24,44.0,6.0,3.0,0.0,2.0,7.0 +89465,gordoto01,2007,1,PHI,NL,44,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89466,gonzalu01,2007,1,LAN,NL,139,464,70,129,23,2,15,68.0,6.0,2.0,56,56.0,4.0,4.0,0.0,2.0,11.0 +89467,gomezch02,2007,2,CLE,AL,19,53,4,15,2,0,0,5.0,0.0,0.0,0,6.0,0.0,0.0,1.0,1.0,1.0 +89468,gomezch02,2007,1,BAL,AL,73,169,17,51,10,1,1,16.0,1.0,2.0,10,20.0,1.0,0.0,5.0,1.0,5.0 +89469,glavito02,2007,1,NYN,NL,33,56,3,12,1,0,0,4.0,0.0,0.0,6,5.0,0.0,0.0,12.0,1.0,0.0 +89473,floydcl01,2007,1,CHN,NL,108,282,40,80,10,1,9,45.0,0.0,0.0,35,47.0,5.0,5.0,0.0,0.0,6.0 +89474,finlest01,2007,1,COL,NL,43,94,9,17,3,0,1,2.0,0.0,0.0,8,4.0,1.0,0.0,0.0,0.0,2.0 +89480,embreal01,2007,1,OAK,AL,4,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89481,edmonji01,2007,1,SLN,NL,117,365,39,92,15,2,12,53.0,0.0,2.0,41,75.0,2.0,0.0,2.0,3.0,9.0 +89482,easleda01,2007,1,NYN,NL,76,193,24,54,6,0,10,26.0,0.0,1.0,19,35.0,1.0,5.0,0.0,1.0,2.0 +89489,delgaca01,2007,1,NYN,NL,139,538,71,139,30,0,24,87.0,4.0,0.0,52,118.0,8.0,11.0,0.0,6.0,12.0 +89493,cormirh01,2007,1,CIN,NL,6,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89494,coninje01,2007,2,NYN,NL,21,41,2,8,2,0,0,5.0,0.0,0.0,7,8.0,2.0,0.0,1.0,1.0,1.0 +89495,coninje01,2007,1,CIN,NL,80,215,23,57,11,1,6,32.0,4.0,0.0,20,28.0,0.0,0.0,1.0,6.0,4.0 +89497,clemero02,2007,1,NYA,AL,2,2,0,1,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89498,claytro01,2007,2,BOS,AL,8,6,1,0,0,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,2.0 +89499,claytro01,2007,1,TOR,AL,69,189,23,48,14,0,1,12.0,2.0,1.0,14,50.0,0.0,1.0,3.0,3.0,8.0 +89501,cirilje01,2007,2,ARI,NL,28,40,6,8,4,0,0,6.0,0.0,0.0,4,6.0,0.0,0.0,0.0,0.0,1.0 +89502,cirilje01,2007,1,MIN,AL,50,153,18,40,9,2,2,21.0,2.0,0.0,15,13.0,0.0,1.0,3.0,2.0,9.0 +89521,bondsba01,2007,1,SFN,NL,126,340,75,94,14,0,28,66.0,5.0,0.0,132,54.0,43.0,3.0,0.0,2.0,13.0 +89523,biggicr01,2007,1,HOU,NL,141,517,68,130,31,3,10,50.0,4.0,3.0,23,112.0,0.0,3.0,7.0,5.0,5.0 +89525,benitar01,2007,2,FLO,NL,34,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89526,benitar01,2007,1,SFN,NL,19,0,0,0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0 +89530,ausmubr01,2007,1,HOU,NL,117,349,38,82,16,3,3,25.0,6.0,1.0,37,74.0,3.0,6.0,4.0,1.0,11.0 +89533,aloumo01,2007,1,NYN,NL,87,328,51,112,19,1,13,49.0,3.0,0.0,27,30.0,5.0,2.0,0.0,3.0,13.0 +89534,alomasa02,2007,1,NYN,NL,8,22,1,3,1,0,0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,0.0 diff --git a/doc/data/iris.data b/doc/data/iris.data new file mode 100644 index 00000000..2953c6b5 --- /dev/null +++ b/doc/data/iris.data @@ -0,0 +1,152 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa 
+5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor +6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica 
+7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica + diff --git a/doc/data/mindex_ex.csv b/doc/data/mindex_ex.csv new file mode 100644 index 00000000..935ff936 --- /dev/null +++ b/doc/data/mindex_ex.csv @@ -0,0 +1,16 @@ +year,indiv,zit,xit +1977,"A",1.2,.6 +1977,"B",1.5,.5 +1977,"C",1.7,.8 +1978,"A",.2,.06 +1978,"B",.7,.2 +1978,"C",.8,.3 +1978,"D",.9,.5 +1978,"E",1.4,.9 +1979,"C",.2,.15 +1979,"D",.14,.05 +1979,"E",.5,.15 +1979,"F",1.2,.5 +1979,"G",3.4,1.9 +1979,"H",5.4,2.7 +1979,"I",6.4,1.2 diff --git a/doc/make.py b/doc/make.py new file mode 100755 index 00000000..345d8dcc --- /dev/null +++ b/doc/make.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python + +""" +Python script for building documentation. + +To build the docs you must have all optional dependencies for pandas +installed. See the installation instructions for a list of these. + +Note: currently latex builds do not work because of table formats that are not +supported in the latex generation. + +Usage +----- +python make.py clean +python make.py html +""" + +import glob +import os +import shutil +import sys +import sphinx + +os.environ['PYTHONPATH'] = '..' + +SPHINX_BUILD = 'sphinxbuild' + +def upload_dev(): + 'push a copy to the pydata dev directory' + if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'): + raise SystemExit('Upload to Pydata Dev failed') + +def upload_dev_pdf(): + 'push a copy to the pydata dev directory' + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/dev/'): + raise SystemExit('PDF upload to Pydata Dev failed') + +def upload_stable(): + 'push a copy to the pydata stable directory' + if os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'): + raise SystemExit('Upload to stable failed') + +def upload_stable_pdf(): + 'push a copy to the pydata dev directory' + if os.system('cd build/latex; scp pandas.pdf pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/stable/'): + raise SystemExit('PDF upload to stable failed') + +def clean(): + if os.path.exists('build'): + shutil.rmtree('build') + + if os.path.exists('source/generated'): + shutil.rmtree('source/generated') + +def html(): + check_build() + if os.system('sphinx-build -P -b html -d build/doctrees ' + 'source build/html'): + raise SystemExit("Building HTML failed.") + +def latex(): + check_build() + if sys.platform != 'win32': + # LaTeX format. + if os.system('sphinx-build -b latex -d build/doctrees ' + 'source build/latex'): + raise SystemExit("Building LaTeX failed.") + # Produce pdf. + + os.chdir('build/latex') + + # Call the makefile produced by sphinx... 
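+        # os.system returns the shell's exit status; any non-zero value here
+        # means the LaTeX 'make' run failed, and the build is aborted below.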
+ if os.system('make'): + raise SystemExit("Rendering LaTeX failed.") + + os.chdir('../..') + else: + print('latex build has not been tested on windows') + +def check_build(): + build_dirs = [ + 'build', 'build/doctrees', 'build/html', + 'build/latex', 'build/plots', 'build/_static', + 'build/_templates'] + for d in build_dirs: + try: + os.mkdir(d) + except OSError: + pass + +def all(): + # clean() + html() + +def auto_dev_build(debug=False): + msg = '' + try: + step = 'clean' + clean() + step = 'html' + html() + step = 'upload dev' + upload_dev() + if not debug: + sendmail(step) + + step = 'latex' + latex() + step = 'upload pdf' + upload_dev_pdf() + if not debug: + sendmail(step) + except (Exception, SystemExit), inst: + msg = str(inst) + '\n' + sendmail(step, '[ERROR] ' + msg) + +def sendmail(step=None, err_msg=None): + from_name, to_name = _get_config() + + if step is None: + step = '' + + if err_msg is None or '[ERROR]' not in err_msg: + msgstr = 'Daily docs %s completed successfully' % step + subject = "DOC: %s successful" % step + else: + msgstr = err_msg + subject = "DOC: %s failed" % step + + import smtplib + from email.MIMEText import MIMEText + msg = MIMEText(msgstr) + msg['Subject'] = subject + msg['From'] = from_name + msg['To'] = to_name + + server_str, port, login, pwd = _get_credentials() + server = smtplib.SMTP(server_str, port) + server.ehlo() + server.starttls() + server.ehlo() + + server.login(login, pwd) + try: + server.sendmail(from_name, to_name, msg.as_string()) + finally: + server.close() + +def _get_dir(): + import getpass + USERNAME = getpass.getuser() + if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME + else: + HOME = '/home/%s' % USERNAME + + tmp_dir = '%s/tmp' % HOME + return tmp_dir + +def _get_credentials(): + tmp_dir = _get_dir() + cred = '%s/credentials' % tmp_dir + with open(cred, 'r') as fh: + server, port, un, domain = fh.read().split(',') + port = int(port) + login = un + '@' + domain + '.com' + + import base64 + with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: + pwd = base64.b64decode(fh.read()) + + return server, port, login, pwd + +def _get_config(): + tmp_dir = _get_dir() + with open('%s/config' % tmp_dir, 'r') as fh: + from_name, to_name = fh.read().split(',') + return from_name, to_name + +funcd = { + 'html' : html, + 'upload_dev' : upload_dev, + 'upload_stable' : upload_stable, + 'upload_dev_pdf' : upload_dev_pdf, + 'upload_stable_pdf' : upload_stable_pdf, + 'latex' : latex, + 'clean' : clean, + 'auto_dev' : auto_dev_build, + 'auto_debug' : lambda: auto_dev_build(True), + 'all' : all, + } + +small_docs = False + +# current_dir = os.getcwd() +# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) + +if len(sys.argv)>1: + for arg in sys.argv[1:]: + func = funcd.get(arg) + if func is None: + raise SystemExit('Do not know how to handle %s; valid args are %s'%( + arg, funcd.keys())) + func() +else: + small_docs = False + all() +#os.chdir(current_dir) diff --git a/doc/plots/stats/moment_plots.py b/doc/plots/stats/moment_plots.py new file mode 100644 index 00000000..7c8b6fb5 --- /dev/null +++ b/doc/plots/stats/moment_plots.py @@ -0,0 +1,29 @@ +import numpy as np + +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + +def test_series(n=1000): + t.N = n + s = t.makeTimeSeries() + return s + +def plot_timeseries(*args, **kwds): + n = len(args) + + fig, axes = plt.subplots(n, 1, figsize=kwds.get('size', (10, 5)), + sharex=True) + titles = kwds.get('titles', None) + + for k in 
range(1, n + 1): + ax = axes[k-1] + ts = args[k-1] + ax.plot(ts.index, ts.values) + + if titles: + ax.set_title(titles[k-1]) + + fig.autofmt_xdate() + fig.subplots_adjust(bottom=0.10, top=0.95) + diff --git a/doc/plots/stats/moments_ewma.py b/doc/plots/stats/moments_ewma.py new file mode 100644 index 00000000..3e521ed6 --- /dev/null +++ b/doc/plots/stats/moments_ewma.py @@ -0,0 +1,15 @@ +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + +t.N = 200 +s = t.makeTimeSeries().cumsum() + +plt.figure(figsize=(10, 5)) +plt.plot(s.index, s.values) +plt.plot(s.index, m.ewma(s, 20, min_periods=1).values) +f = plt.gcf() +f.autofmt_xdate() + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_ewmvol.py b/doc/plots/stats/moments_ewmvol.py new file mode 100644 index 00000000..093f6286 --- /dev/null +++ b/doc/plots/stats/moments_ewmvol.py @@ -0,0 +1,23 @@ +import matplotlib.pyplot as plt +import pandas.util.testing as t +import pandas.stats.moments as m + +t.N = 500 +ts = t.makeTimeSeries() +ts[::100] = 20 + +s = ts.cumsum() + + +plt.figure(figsize=(10, 5)) +plt.plot(s.index, m.ewmvol(s, span=50, min_periods=1).values, color='b') +plt.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') + +plt.title('Exp-weighted std with shocks') +plt.legend(('Exp-weighted', 'Equal-weighted')) + +f = plt.gcf() +f.autofmt_xdate() + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_expw.py b/doc/plots/stats/moments_expw.py new file mode 100644 index 00000000..699b6cce --- /dev/null +++ b/doc/plots/stats/moments_expw.py @@ -0,0 +1,33 @@ +from moment_plots import * + +np.random.seed(1) + +ts = test_series(500) * 10 + +# ts[::100] = 20 + +s = ts.cumsum() + +fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) + +ax0, ax1, ax2 = axes + +ax0.plot(s.index, s.values) +ax0.set_title('time series') + +ax1.plot(s.index, m.ewma(s, span=50, min_periods=1).values, color='b') +ax1.plot(s.index, m.rolling_mean(s, 50, min_periods=1).values, color='r') +ax1.set_title('rolling_mean vs. ewma') + +line1 = ax2.plot(s.index, m.ewmstd(s, span=50, min_periods=1).values, color='b') +line2 = ax2.plot(s.index, m.rolling_std(s, 50, min_periods=1).values, color='r') +ax2.set_title('rolling_std vs. 
ewmstd') + +fig.legend((line1, line2), + ('Exp-weighted', 'Equal-weighted'), + loc='upper right') +fig.autofmt_xdate() +fig.subplots_adjust(bottom=0.10, top=0.95) + +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_rolling.py b/doc/plots/stats/moments_rolling.py new file mode 100644 index 00000000..30a6c5f5 --- /dev/null +++ b/doc/plots/stats/moments_rolling.py @@ -0,0 +1,24 @@ +from moment_plots import * + +ts = test_series() +s = ts.cumsum() + +s[20:50] = np.NaN +s[120:150] = np.NaN +plot_timeseries(s, + m.rolling_count(s, 50), + m.rolling_sum(s, 50, min_periods=10), + m.rolling_mean(s, 50, min_periods=10), + m.rolling_std(s, 50, min_periods=10), + m.rolling_skew(s, 50, min_periods=10), + m.rolling_kurt(s, 50, min_periods=10), + size=(10, 12), + titles=('time series', + 'rolling_count', + 'rolling_sum', + 'rolling_mean', + 'rolling_std', + 'rolling_skew', + 'rolling_kurt')) +plt.show() +plt.close('all') diff --git a/doc/plots/stats/moments_rolling_binary.py b/doc/plots/stats/moments_rolling_binary.py new file mode 100644 index 00000000..ab6b7b1c --- /dev/null +++ b/doc/plots/stats/moments_rolling_binary.py @@ -0,0 +1,30 @@ +from moment_plots import * + +np.random.seed(1) + +ts = test_series() +s = ts.cumsum() +ts2 = test_series() +s2 = ts2.cumsum() + +s[20:50] = np.NaN +s[120:150] = np.NaN +fig, axes = plt.subplots(3, 1, figsize=(8, 10), sharex=True) + +ax0, ax1, ax2 = axes + +ax0.plot(s.index, s.values) +ax0.plot(s2.index, s2.values) +ax0.set_title('time series') + +ax1.plot(s.index, m.rolling_corr(s, s2, 50, min_periods=1).values) +ax1.set_title('rolling_corr') + +ax2.plot(s.index, m.rolling_cov(s, s2, 50, min_periods=1).values) +ax2.set_title('rolling_cov') + +fig.autofmt_xdate() +fig.subplots_adjust(bottom=0.10, top=0.95) + +plt.show() +plt.close('all') diff --git a/doc/source/_static/stub b/doc/source/_static/stub new file mode 100644 index 00000000..e69de29b diff --git a/doc/source/api.rst b/doc/source/api.rst new file mode 100644 index 00000000..5cad5191 --- /dev/null +++ b/doc/source/api.rst @@ -0,0 +1,457 @@ +.. currentmodule:: pandas +.. _api: + +************* +API Reference +************* + +.. _api.functions: + +General functions +----------------- + +Data manipulations +~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.tools.pivot + +.. autosummary:: + :toctree: generated/ + + pivot_table + +.. currentmodule:: pandas.tools.merge + +.. autosummary:: + :toctree: generated/ + + merge + concat + +Pickling +~~~~~~~~ + +.. currentmodule:: pandas.core.common + +.. autosummary:: + :toctree: generated/ + + load + save + +File IO +~~~~~~~ + +.. currentmodule:: pandas.io.parsers + +.. autosummary:: + :toctree: generated/ + + read_table + read_csv + ExcelFile.parse + +HDFStore: PyTables (HDF5) +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. currentmodule:: pandas.io.pytables + +.. autosummary:: + :toctree: generated/ + + HDFStore.put + HDFStore.get + +Standard moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. currentmodule:: pandas.stats.moments + +.. autosummary:: + :toctree: generated/ + + rolling_count + rolling_sum + rolling_mean + rolling_median + rolling_var + rolling_std + rolling_corr + rolling_cov + rolling_skew + rolling_kurt + rolling_apply + rolling_quantile + +Exponentially-weighted moving window functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + ewma + ewmstd + ewmvar + ewmcorr + ewmcov + +.. currentmodule:: pandas + +.. 
_api.series: + +Series +------ + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + * **index**: axis labels + +.. autosummary:: + :toctree: generated/ + + Series.values + Series.dtype + Series.isnull + Series.notnull + +Conversion / Constructors +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + Series.__init__ + Series.astype + Series.copy + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.get + Series.ix + Series.__iter__ + Series.iteritems + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.add + Series.div + Series.mul + Series.sub + Series.combine + Series.combine_first + +Function application, GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.apply + Series.map + Series.groupby + +.. _api.series.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.autocorr + Series.clip + Series.clip_lower + Series.clip_upper + Series.corr + Series.count + Series.cumprod + Series.cumsum + Series.describe + Series.diff + Series.max + Series.mean + Series.median + Series.min + Series.prod + Series.quantile + Series.skew + Series.std + Series.sum + Series.var + Series.value_counts + +Reindexing / Selection / Label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.align + Series.drop + Series.reindex + Series.reindex_like + Series.rename + Series.select + Series.take + Series.truncate + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.dropna + Series.fillna + Series.interpolate + +Reshaping, sorting +~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.argsort + Series.order + Series.sort + Series.sort_index + Series.sortlevel + Series.unstack + +Combining / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.append + +Time series-related +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.asfreq + Series.asof + Series.shift + Series.first_valid_index + Series.last_valid_index + Series.weekday + +Plotting +~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.hist + Series.plot + +Serialization / IO / Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + Series.from_csv + Series.load + Series.save + Series.to_csv + Series.to_dict + Series.to_sparse + +.. _api.dataframe: + +DataFrame +--------- + +Attributes and underlying data +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +**Axes** + + * **index**: row labels + * **columns**: column labels + +.. autosummary:: + :toctree: generated/ + + DataFrame.as_matrix + DataFrame.dtypes + DataFrame.get_dtype_counts + DataFrame.values + DataFrame.axes + DataFrame.ndim + DataFrame.shape + +Conversion / Constructors +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.__init__ + DataFrame.astype + DataFrame.copy + +Indexing, iteration +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.ix + DataFrame.insert + DataFrame.__iter__ + DataFrame.iteritems + DataFrame.pop + DataFrame.xs + +Binary operator functions +~~~~~~~~~~~~~~~~~~~~~~~~~ +.. 
autosummary:: + :toctree: generated/ + + DataFrame.add + DataFrame.div + DataFrame.mul + DataFrame.sub + DataFrame.radd + DataFrame.rdiv + DataFrame.rmul + DataFrame.rsub + DataFrame.combine + DataFrame.combineAdd + DataFrame.combine_first + DataFrame.combineMult + +Function application, GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.apply + DataFrame.applymap + DataFrame.groupby + +.. _api.dataframe.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.clip + DataFrame.clip_lower + DataFrame.clip_upper + DataFrame.corr + DataFrame.corrwith + DataFrame.count + DataFrame.cumprod + DataFrame.cumsum + DataFrame.describe + DataFrame.diff + DataFrame.mad + DataFrame.max + DataFrame.mean + DataFrame.median + DataFrame.min + DataFrame.prod + DataFrame.quantile + DataFrame.skew + DataFrame.sum + DataFrame.std + DataFrame.var + +Reindexing / Selection / Label manipulation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.add_prefix + DataFrame.add_suffix + DataFrame.align + DataFrame.drop + DataFrame.filter + DataFrame.reindex + DataFrame.reindex_like + DataFrame.rename + DataFrame.select + DataFrame.take + DataFrame.truncate + DataFrame.head + DataFrame.tail + +.. _api.dataframe.missing: + +Missing data handling +~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.dropna + DataFrame.fillna + +Reshaping, sorting, transposing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.sort_index + DataFrame.delevel + DataFrame.pivot + DataFrame.sortlevel + DataFrame.swaplevel + DataFrame.stack + DataFrame.unstack + DataFrame.T + DataFrame.transpose + +Combining / joining / merging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.join + DataFrame.merge + DataFrame.append + +Time series-related +~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.asfreq + DataFrame.shift + DataFrame.first_valid_index + DataFrame.last_valid_index + +Plotting +~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.hist + DataFrame.plot + +Serialization / IO / Conversion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. autosummary:: + :toctree: generated/ + + DataFrame.from_csv + DataFrame.from_records + DataFrame.to_csv + DataFrame.to_excel + DataFrame.to_dict + DataFrame.to_records + DataFrame.to_sparse + DataFrame.to_string + DataFrame.save + DataFrame.load + DataFrame.info + +.. _api.panel: + +Panel +----- + +.. _api.panel.stats: + +Computations / Descriptive Stats +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + diff --git a/doc/source/basics.rst b/doc/source/basics.rst new file mode 100644 index 00000000..261c236b --- /dev/null +++ b/doc/source/basics.rst @@ -0,0 +1,994 @@ +.. currentmodule:: pandas +.. _basics: + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + +***************************** +Essential basic functionality +***************************** + +Here we discuss a lot of the essential functionality common to the pandas data +structures. Here's how to create some of the objects used in the examples from +the previous section: + +.. 
ipython:: python + + index = date_range('1/1/2000', periods=8) + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + +.. _basics.head_tail: + +Head and Tail +------------- + +To view a small sample of a Series or DataFrame object, use the ``head`` and +``tail`` methods. The default number of elements to display is five, but you +may pass a custom number. + +.. ipython:: python + + long_series = Series(randn(1000)) + long_series.head() + long_series.tail(3) + +.. _basics.attrs: + +Attributes and the raw ndarray(s) +--------------------------------- + +pandas objects have a number of attributes enabling you to access the metadata + + * **shape**: gives the axis dimensions of the object, consistent with ndarray + * Axis labels + + * **Series**: *index* (only axis) + * **DataFrame**: *index* (rows) and *columns* + * **Panel**: *items*, *major_axis*, and *minor_axis* + +Note, **these attributes can be safely assigned to**! + +.. ipython:: python + + df[:2] + df.columns = [x.lower() for x in df.columns] + df + +To get the actual data inside a data structure, one need only access the +**values** property: + +.. ipython:: python + + s.values + df.values + wp.values + +If a DataFrame or Panel contains homogeneously-typed data, the ndarray can +actually be modified in-place, and the changes will be reflected in the data +structure. For heterogeneous data (e.g. some of the DataFrame's columns are not +all the same dtype), this will not be the case. The values attribute itself, +unlike the axis labels, cannot be assigned to. + +.. note:: + + When working with heterogeneous data, the dtype of the resulting ndarray + will be chosen to accommodate all of the data involved. For example, if + strings are involved, the result will be of object dtype. If there are only + floats and integers, the resulting array will be of float dtype. + +.. _basics.binop: + +Flexible binary operations +-------------------------- + +With binary operations between pandas data structures, there are two key points +of interest: + + * Broadcasting behavior between higher- (e.g. DataFrame) and + lower-dimensional (e.g. Series) objects. + * Missing data in computations + +We will demonstrate how to manage these issues independently, though they can +be handled simultaneously. + +Matching / broadcasting behavior +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DataFrame has the methods **add, sub, mul, div** and related functions **radd, +rsub, ...** for carrying out binary operations. For broadcasting behavior, +Series input is of primary interest. Using these functions, you can match on +either the *index* or *columns* via the **axis** keyword: + +.. ipython:: python + :suppress: + + d = {'one' : Series(randn(3), index=['a', 'b', 'c']), + 'two' : Series(randn(4), index=['a', 'b', 'c', 'd']), + 'three' : Series(randn(3), index=['b', 'c', 'd'])} + df = DataFrame(d) + +.. ipython:: python + + df + row = df.ix[1] + column = df['two'] + + df.sub(row, axis='columns') + df.sub(row, axis=1) + + df.sub(column, axis='index') + df.sub(column, axis=0) + +With Panel, describing the matching behavior is a bit more difficult, so +the arithmetic methods instead (and perhaps confusingly?) give you the option +to specify the *broadcast axis*. For example, suppose we wished to demean the +data over a particular axis. 
This can be accomplished by taking the mean over +an axis and broadcasting over the same axis: + +.. ipython:: python + + major_mean = wp.mean(axis='major') + major_mean + wp.sub(major_mean, axis='major') + +And similarly for axis="items" and axis="minor". + +.. note:: + + I could be convinced to make the **axis** argument in the DataFrame methods + match the broadcasting behavior of Panel. Though it would require a + transition period so users can change their code... + +Missing data / operations with fill values +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In Series and DataFrame (though not yet in Panel), the arithmetic functions +have the option of inputting a *fill_value*, namely a value to substitute when +at most one of the values at a location are missing. For example, when adding +two DataFrame objects, you may wish to treat NaN as 0 unless both DataFrames +are missing that value, in which case the result will be NaN (you can later +replace NaN with some other value using ``fillna`` if you wish). + +.. ipython:: python + :suppress: + + df2 = df.copy() + df2['three']['a'] = 1. + +.. ipython:: python + + df + df2 + df + df2 + df.add(df2, fill_value=0) + +Flexible Comparisons +~~~~~~~~~~~~~~~~~~~~ +Starting in v0.8, pandas introduced binary comparison methods eq, ne, lt, gt, +le, and ge to Series and DataFrame whose behavior is analogous to the binary +arithmetic operations described above: + +.. ipython:: python + + df.gt(df2) + + df2.ne(df) + +Combining overlapping data sets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A problem occasionally arising is the combination of two similar data sets +where values in one are preferred over the other. An example would be two data +series representing a particular economic indicator where one is considered to +be of "higher quality". However, the lower quality series might extend further +back in history or have more complete data coverage. As such, we would like to +combine two DataFrame objects where missing values in one DataFrame are +conditionally filled with like-labeled values from the other DataFrame. The +function implementing this operation is ``combine_first``, which we illustrate: + +.. ipython:: python + + df1 = DataFrame({'A' : [1., np.nan, 3., 5., np.nan], + 'B' : [np.nan, 2., 3., np.nan, 6.]}) + df2 = DataFrame({'A' : [5., 2., 4., np.nan, 3., 7.], + 'B' : [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 + df2 + df1.combine_first(df2) + +General DataFrame Combine +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``combine_first`` method above calls the more general DataFrame method +``combine``. This method takes another DataFrame and a combiner function, +aligns the input DataFrame and then passes the combiner function pairs of +Series (ie, columns whose names are the same). + +So, for instance, to reproduce ``combine_first`` as above: + +.. ipython:: python + + combiner = lambda x, y: np.where(isnull(x), y, x) + df1.combine(df2, combiner) + +.. _basics.stats: + +Descriptive statistics +---------------------- + +A large number of methods for computing descriptive statistics and other related +operations on :ref:`Series `, :ref:`DataFrame +`, and :ref:`Panel `. Most of these +are aggregations (hence producing a lower-dimensional result) like **sum**, +**mean**, and **quantile**, but some of them, like **cumsum** and **cumprod**, +produce an object of the same size. 
Generally speaking, these methods take an +**axis** argument, just like *ndarray.{sum, std, ...}*, but the axis can be +specified by name or integer: + + - **Series**: no axis argument needed + - **DataFrame**: "index" (axis=0, default), "columns" (axis=1) + - **Panel**: "items" (axis=0), "major" (axis=1, default), "minor" + (axis=2) + +For example: + +.. ipython:: python + + df + df.mean(0) + df.mean(1) + +All such methods have a ``skipna`` option signaling whether to exclude missing +data (``True`` by default): + +.. ipython:: python + + df.sum(0, skipna=False) + df.sum(axis=1, skipna=True) + +Combined with the broadcasting / arithmetic behavior, one can describe various +statistical procedures, like standardization (rendering data zero mean and +standard deviation 1), very concisely: + +.. ipython:: python + + ts_stand = (df - df.mean()) / df.std() + ts_stand.std() + xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0) + xs_stand.std(1) + +Note that methods like **cumsum** and **cumprod** preserve the location of NA +values: + +.. ipython:: python + + df.cumsum() + +Here is a quick reference summary table of common functions. Each also takes an +optional ``level`` parameter which applies only if the object has a +:ref:`hierarchical index`. + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``count``, Number of non-null observations + ``sum``, Sum of values + ``mean``, Mean of values + ``mad``, Mean absolute deviation + ``median``, Arithmetic median of values + ``min``, Minimum + ``max``, Maximum + ``abs``, Absolute Value + ``prod``, Product of values + ``std``, Unbiased standard deviation + ``var``, Unbiased variance + ``skew``, Unbiased skewness (3rd moment) + ``kurt``, Unbiased kurtosis (4th moment) + ``quantile``, Sample quantile (value at %) + ``cumsum``, Cumulative sum + ``cumprod``, Cumulative product + ``cummax``, Cumulative maximum + ``cummin``, Cumulative minimum + +Note that by chance some NumPy methods, like ``mean``, ``std``, and ``sum``, +will exclude NAs on Series input by default: + +.. ipython:: python + + np.mean(df['one']) + np.mean(df['one'].values) + +``Series`` also has a method ``nunique`` which will return the number of unique +non-null values: + +.. ipython:: python + + series = Series(randn(500)) + series[20:500] = np.nan + series[10:20] = 5 + series.nunique() + + +Summarizing data: describe +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There is a convenient ``describe`` function which computes a variety of summary +statistics about a Series or the columns of a DataFrame (excluding NAs of +course): + +.. ipython:: python + + series = Series(randn(1000)) + series[::2] = np.nan + series.describe() + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.ix[::2] = np.nan + frame.describe() + +.. _basics.describe: + +For a non-numerical Series object, `describe` will give a simple summary of the +number of unique values and most frequently occurring values: + + +.. ipython:: python + + s = Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s.describe() + +There also is a utility function, ``value_range`` which takes a DataFrame and +returns a series with the minimum/maximum values in the DataFrame. + +.. _basics.idxmin: + +Index of Min/Max Values +~~~~~~~~~~~~~~~~~~~~~~~ + +The ``idxmin`` and ``idxmax`` functions on Series and DataFrame compute the +index labels with the minimum and maximum corresponding values: + +.. 
ipython:: python + + s1 = Series(randn(5)) + s1 + s1.idxmin(), s1.idxmax() + + df1 = DataFrame(randn(5,3), columns=['A','B','C']) + df1 + df1.idxmin(axis=0) + df1.idxmax(axis=1) + +Value counts (histogramming) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``value_counts`` Series method and top-level function computes a histogram +of a 1D array of values. It can also be used as a function on regular arrays: + +.. ipython:: python + + data = np.random.randint(0, 7, size=50) + data + s = Series(data) + s.value_counts() + value_counts(data) + + +Discretization and quantiling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Continuous values can be discretized using the ``cut`` (bins based on values) +and ``qcut`` (bins based on sample quantiles) functions: + +.. ipython:: python + + arr = np.random.randn(20) + factor = cut(arr, 4) + factor + + factor = cut(arr, [-5, -1, 0, 1, 5]) + factor + +``qcut`` computes sample quantiles. For example, we could slice up some +normally distributed data into equal-size quartiles like so: + +.. ipython:: python + + arr = np.random.randn(30) + factor = qcut(arr, [0, .25, .5, .75, 1]) + factor + value_counts(factor) + + +.. _basics.apply: + +Function application +-------------------- + +Arbitrary functions can be applied along the axes of a DataFrame or Panel +using the ``apply`` method, which, like the descriptive statistics methods, +take an optional ``axis`` argument: + +.. ipython:: python + + df.apply(np.mean) + df.apply(np.mean, axis=1) + df.apply(lambda x: x.max() - x.min()) + df.apply(np.cumsum) + df.apply(np.exp) + +Depending on the return type of the function passed to ``apply``, the result +will either be of lower dimension or the same dimension. + +``apply`` combined with some cleverness can be used to answer many questions +about a data set. For example, suppose we wanted to extract the date where the +maximum value for each column occurred: + + +.. ipython:: python + + tsdf = DataFrame(randn(1000, 3), columns=['A', 'B', 'C'], + index=date_range('1/1/2000', periods=1000)) + tsdf.apply(lambda x: x.index[x.dropna().argmax()]) + +You may also pass additional arguments and keyword arguments to the ``apply`` +method. For instance, consider the following function you would like to apply: + +.. code-block:: python + + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + +You may then apply this function as follows: + +.. code-block:: python + + df.apply(subtract_and_divide, args=(5,), divide=3) + +Another useful feature is the ability to pass Series methods to carry out some +Series operation on each column or row: + +.. ipython:: python + :suppress: + + tsdf = DataFrame(randn(10, 3), columns=['A', 'B', 'C'], + index=date_range('1/1/2000', periods=10)) + tsdf.values[3:7] = np.nan + +.. ipython:: python + + tsdf + tsdf.apply(Series.interpolate) + +Finally, ``apply`` takes an argument ``raw`` which is False by default, which +converts each row or column into a Series before applying the function. When +set to True, the passed function will instead receive an ndarray object, which +has positive performance implications if you do not need the indexing +functionality. + +.. seealso:: + + The section on :ref:`GroupBy ` demonstrates related, flexible + functionality for grouping by some criterion, applying, and combining the + results into a Series, DataFrame, etc. 
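+Returning to the ``raw`` argument mentioned above, here is a small sketch; the
+``clean`` frame below is purely illustrative and not used elsewhere in these
+docs. Both calls compute the same per-column range, but with ``raw=True`` the
+function receives plain ndarrays rather than Series:
+
+.. code-block:: python
+
+    clean = DataFrame(randn(5, 3), columns=['x', 'y', 'z'])
+
+    # default (raw=False): each column is passed to the function as a Series
+    clean.apply(lambda col: col.max() - col.min())
+
+    # raw=True: each column is passed as a plain ndarray, skipping the
+    # per-column Series construction; note that ndarray reductions do not
+    # skip NaN the way Series methods do
+    clean.apply(lambda col: col.max() - col.min(), raw=True)
+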
+ +Applying elementwise Python functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since not all functions can be vectorized (accept NumPy arrays and return +another array or value), the methods ``applymap`` on DataFrame and analogously +``map`` on Series accept any Python function taking a single value and +returning a single value. For example: + +.. ipython:: python + + f = lambda x: len(str(x)) + df['one'].map(f) + df.applymap(f) + +``Series.map`` has an additional feature which is that it can be used to easily +"link" or "map" values defined by a secondary series. This is closely related +to :ref:`merging/joining functionality `: + + +.. ipython:: python + + s = Series(['six', 'seven', 'six', 'seven', 'six'], + index=['a', 'b', 'c', 'd', 'e']) + t = Series({'six' : 6., 'seven' : 7.}) + s + s.map(t) + +.. _basics.reindexing: + +Reindexing and altering labels +------------------------------ + +``reindex`` is the fundamental data alignment method in pandas. It is used to +implement nearly all other features relying on label-alignment +functionality. To *reindex* means to conform the data to match a given set of +labels along a particular axis. This accomplishes several things: + + * Reorders the existing data to match a new set of labels + * Inserts missing value (NA) markers in label locations where no data for + that label existed + * If specified, **fill** data for missing labels using logic (highly relevant + to working with time series data) + +Here is a simple example: + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s + s.reindex(['e', 'b', 'f', 'd']) + +Here, the ``f`` label was not contained in the Series and hence appears as +``NaN`` in the result. + +With a DataFrame, you can simultaneously reindex the index and columns: + +.. ipython:: python + + df + df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) + +For convenience, you may utilize the ``reindex_axis`` method, which takes the +labels and a keyword ``axis`` parameter. + +Note that the ``Index`` objects containing the actual axis labels can be +**shared** between objects. So if we have a Series and a DataFrame, the +following can be done: + +.. ipython:: python + + rs = s.reindex(df.index) + rs + rs.index is df.index + +This means that the reindexed Series's index is the same Python object as the +DataFrame's index. + + +.. seealso:: + + :ref:`Advanced indexing ` is an even more concise way of + doing reindexing. + +.. note:: + + When writing performance-sensitive code, there is a good reason to spend + some time becoming a reindexing ninja: **many operations are faster on + pre-aligned data**. Adding two unaligned DataFrames internally triggers a + reindexing step. For exploratory analysis you will hardly notice the + difference (because ``reindex`` has been heavily optimized), but when CPU + cycles matter sprinkling a few explicit ``reindex`` calls here and there can + have an impact. + +.. _basics.reindex_like: + +Reindexing to align with another object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may wish to take an object and reindex its axes to be labeled the same as +another object. While the syntax for this is straightforward albeit verbose, it +is a common enough operation that the ``reindex_like`` method is available to +make this simpler: + +.. ipython:: python + :suppress: + + df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) + df2 = df2 - df2.mean() + + +.. 
ipython:: python + + df + df2 + df.reindex_like(df2) + +Reindexing with ``reindex_axis`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _basics.align: + +Aligning objects with each other with ``align`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``align`` method is the fastest way to simultaneously align two objects. It +supports a ``join`` argument (related to :ref:`joining and merging `): + + - ``join='outer'``: take the union of the indexes + - ``join='left'``: use the calling object's index + - ``join='right'``: use the passed object's index + - ``join='inner'``: intersect the indexes + +It returns a tuple with both of the reindexed Series: + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s1 = s[:4] + s2 = s[1:] + s1.align(s2) + s1.align(s2, join='inner') + s1.align(s2, join='left') + +.. _basics.df_join: + +For DataFrames, the join method will be applied to both the index and the +columns by default: + +.. ipython:: python + + df.align(df2, join='inner') + +You can also pass an ``axis`` option to only align on the specified axis: + +.. ipython:: python + + df.align(df2, join='inner', axis=0) + +.. _basics.align.frame.series: + +If you pass a Series to ``DataFrame.align``, you can choose to align both +objects either on the DataFrame's index or columns using the ``axis`` argument: + +.. ipython:: python + + df.align(df2.ix[0], axis=1) + +.. _basics.reindex_fill: + +Filling while reindexing +~~~~~~~~~~~~~~~~~~~~~~~~ + +``reindex`` takes an optional parameter ``method`` which is a filling method +chosen from the following table: + +.. csv-table:: + :header: "Method", "Action" + :widths: 30, 50 + + pad / ffill, Fill values forward + bfill / backfill, Fill values backward + +Other fill methods could be added, of course, but these are the two most +commonly used for time series data. In a way they only make sense for time +series or otherwise ordered data, but you may have an application on non-time +series data where this sort of "interpolation" logic is the correct thing to +do. More sophisticated interpolation of missing values would be an obvious +extension. + +We illustrate these fill methods on a simple TimeSeries: + +.. ipython:: python + + rng = date_range('1/3/2000', periods=8) + ts = Series(randn(8), index=rng) + ts2 = ts[[0, 3, 6]] + ts + ts2 + + ts2.reindex(ts.index) + ts2.reindex(ts.index, method='ffill') + ts2.reindex(ts.index, method='bfill') + +Note the same result could have been achieved using :ref:`fillna +`: + +.. ipython:: python + + ts2.reindex(ts.index).fillna(method='ffill') + +Note these methods generally assume that the indexes are **sorted**. They may +be modified in the future to be a bit more flexible but as time series data is +ordered most of the time anyway, this has not been a major priority. + +.. _basics.drop: + +Dropping labels from an axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A method closely related to ``reindex`` is the ``drop`` function. It removes a +set of labels from an axis: + +.. ipython:: python + + df + df.drop(['a', 'd'], axis=0) + df.drop(['one'], axis=1) + +Note that the following also works, but is a bit less obvious / clean: + +.. ipython:: python + + df.reindex(df.index - ['a', 'd']) + +.. _basics.rename: + +Renaming / mapping labels +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``rename`` method allows you to relabel an axis based on some mapping (a +dict or Series) or an arbitrary function. + +.. 
ipython:: python + + s + s.rename(str.upper) + +If you pass a function, it must return a value when called with any of the +labels (and must produce a set of unique values). But if you pass a dict or +Series, it need only contain a subset of the labels as keys: + +.. ipython:: python + + df.rename(columns={'one' : 'foo', 'two' : 'bar'}, + index={'a' : 'apple', 'b' : 'banana', 'd' : 'durian'}) + +The ``rename`` method also provides an ``inplace`` named parameter that is by +default ``False`` and copies the underlying data. Pass ``inplace=True`` to +rename the data in place. + +.. _basics.rename_axis: + +The Panel class has a related ``rename_axis`` class which can rename any of +its three axes. + +Iteration +--------- + +Because Series is array-like, basic iteration produces the values. Other data +structures follow the dict-like convention of iterating over the "keys" of the +objects. In short: + + * **Series**: values + * **DataFrame**: column labels + * **Panel**: item labels + +Thus, for example: + +.. ipython:: + + In [0]: for col in df: + ...: print col + ...: + +iteritems +~~~~~~~~~ + +Consistent with the dict-like interface, **iteritems** iterates through +key-value pairs: + + * **Series**: (index, scalar value) pairs + * **DataFrame**: (column, Series) pairs + * **Panel**: (item, DataFrame) pairs + +For example: + +.. ipython:: + + In [0]: for item, frame in wp.iteritems(): + ...: print item + ...: print frame + ...: + + +.. _basics.iterrows: + +iterrows +~~~~~~~~ + +New in v0.7 is the ability to iterate efficiently through rows of a +DataFrame. It returns an iterator yielding each index value along with a Series +containing the data in each row: + +.. ipython:: + + In [0]: for row_index, row in df2.iterrows(): + ...: print '%s\n%s' % (row_index, row) + ...: + + +For instance, a contrived way to transpose the dataframe would be: + +.. ipython:: python + + df2 = DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + print df2 + print df2.T + + df2_t = DataFrame(dict((idx,values) for idx, values in df2.iterrows())) + print df2_t + +itertuples +~~~~~~~~~~ + +This method will return an iterator yielding a tuple for each row in the +DataFrame. The first element of the tuple will be the row's corresponding index +value, while the remaining values are the row values proper. + +For instance, + +.. ipython:: python + + for r in df2.itertuples(): print r + +.. _basics.sorting: + +Sorting by index and value +-------------------------- + +There are two obvious kinds of sorting that you may be interested in: sorting +by label and sorting by actual values. The primary method for sorting axis +labels (indexes) across data structures is the ``sort_index`` method. + +.. ipython:: python + + unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], + columns=['three', 'two', 'one']) + unsorted_df.sort_index() + unsorted_df.sort_index(ascending=False) + unsorted_df.sort_index(axis=1) + +``DataFrame.sort_index`` can accept an optional ``by`` argument for ``axis=0`` +which will use an arbitrary vector or a column name of the DataFrame to +determine the sort order: + +.. ipython:: python + + df.sort_index(by='two') + +The ``by`` argument can take a list of column names, e.g.: + +.. ipython:: python + + df = DataFrame({'one':[2,1,1,1],'two':[1,3,2,4],'three':[5,4,3,2]}) + df[['one', 'two', 'three']].sort_index(by=['one','two']) + +Series has the method ``order`` (analogous to `R's order function +`__) which +sorts by value, with special treatment of NA values via the ``na_last`` +argument: + +.. 
ipython:: python + + s[2] = np.nan + s.order() + s.order(na_last=False) + +Some other sorting notes / nuances: + + * ``Series.sort`` sorts a Series by value in-place. This is to provide + compatibility with NumPy methods which expect the ``ndarray.sort`` + behavior. + * ``DataFrame.sort`` takes a ``column`` argument instead of ``by``. This + method will likely be deprecated in a future release in favor of just using + ``sort_index``. + +.. _basics.cast: + +Copying, type casting +--------------------- + +The ``copy`` method on pandas objects copies the underlying data (though not +the axis indexes, since they are immutable) and returns a new object. Note that +**it is seldom necessary to copy objects**. For example, there are only a +handful of ways to alter a DataFrame *in-place*: + + * Inserting, deleting, or modifying a column + * Assigning to the ``index`` or ``columns`` attributes + * For homogeneous data, directly modifying the values via the ``values`` + attribute or advanced indexing + +To be clear, no pandas methods have the side effect of modifying your data; +almost all methods return new objects, leaving the original object +untouched. If data is modified, it is because you did so explicitly. + +Data can be explicitly cast to a NumPy dtype by using the ``astype`` method or +alternately passing the ``dtype`` keyword argument to the object constructor. + +.. ipython:: python + + df = DataFrame(np.arange(12).reshape((4, 3))) + df[0].dtype + df.astype(float)[0].dtype + df = DataFrame(np.arange(12).reshape((4, 3)), dtype=float) + df[0].dtype + +.. _basics.cast.infer: + +Inferring better types for object columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``convert_objects`` DataFrame method will attempt to convert +``dtype=object`` columns to a better NumPy dtype. Occasionally (after +transposing multiple times, for example), a mixed-type DataFrame will end up +with everything as ``dtype=object``. This method attempts to fix that: + +.. ipython:: python + + df = DataFrame(randn(6, 3), columns=['a', 'b', 'c']) + df['d'] = 'foo' + df + df = df.T.T + df.dtypes + converted = df.convert_objects() + converted.dtypes + +.. _basics.serialize: + +Pickling and serialization +-------------------------- + +All pandas objects are equipped with ``save`` methods which use Python's +``cPickle`` module to save data structures to disk using the pickle format. + +.. ipython:: python + + df + df.save('foo.pickle') + +The ``load`` function in the ``pandas`` namespace can be used to load any +pickled pandas object (or any other pickled object) from file: + + +.. ipython:: python + + load('foo.pickle') + +There is also a ``save`` function which takes any object as its first argument: + +.. ipython:: python + + save(df, 'foo.pickle') + load('foo.pickle') + +.. ipython:: python + :suppress: + + import os + os.remove('foo.pickle') + +Console Output Formatting +------------------------- + +.. _basics.console_output: + +Use the ``set_eng_float_format`` function in the ``pandas.core.common`` module +to alter the floating-point formatting of pandas objects to produce a particular +format. + +For instance: + +.. ipython:: python + + set_eng_float_format(accuracy=3, use_eng_prefix=True) + df['a']/1.e3 + df['a']/1.e6 + +.. ipython:: python + :suppress: + + reset_printoptions() + + +The ``set_printoptions`` function has a number of options for controlling how +floating point numbers are formatted (using the ``precision`` argument) in the +console. 
The ``max_rows`` and ``max_columns`` control how many rows and +columns of DataFrame objects are shown by default. If ``max_columns`` is set to +0 (the default, in fact), the library will attempt to fit the DataFrame's +string representation into the current terminal width, falling back to the +summary view otherwise. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst new file mode 100644 index 00000000..57597680 --- /dev/null +++ b/doc/source/comparison_with_r.rst @@ -0,0 +1,38 @@ +.. currentmodule:: pandas +.. _compare_with_r: + +******************************* +Comparison with R / R libraries +******************************* + +Since pandas aims to provide a lot of the data manipulation and analysis +functionality that people use R for, this page was started to provide a more +detailed look at the R language and its many third-party libraries as they +relate to pandas. In offering comparisons with R and CRAN libraries, we care +about the following things: + + - **Functionality / flexibility**: what can / cannot be done with each tool + - **Performance**: how fast are operations. Hard numbers / benchmarks are + preferable + - **Ease-of-use**: is one tool easier or harder to use (you may have to be + the judge of this given side-by-side code comparisons) + +As I do not have an encyclopedic knowledge of R packages, feel free to suggest +additional CRAN packages to add to this list. This is also here to offer a bit +of a translation guide for users of these R packages. + +data.frame +---------- + +zoo +--- + +xts +--- + +plyr +---- + +reshape / reshape2 +------------------ + diff --git a/doc/source/computation.rst b/doc/source/computation.rst new file mode 100644 index 00000000..911601d2 --- /dev/null +++ b/doc/source/computation.rst @@ -0,0 +1,536 @@ +.. currentmodule:: pandas +.. _computation: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + +Computational tools +=================== + +Statistical functions +--------------------- + +.. _computation.pct_change: + +Percent Change +~~~~~~~~~~~~~~ + +Both ``Series`` and ``DataFrame`` have a method ``pct_change`` to compute the +percent change over a given number of periods (using ``fill_method`` to fill +NA/null values). + +.. ipython:: python + + ser = Series(randn(8)) + + ser.pct_change() + +.. ipython:: python + + df = DataFrame(randn(10, 4)) + + df.pct_change(periods=3) + +.. _computation.covariance: + +Covariance +~~~~~~~~~~ + +The ``Series`` object has a method ``cov`` to compute covariance between series +(excluding NA/null values). + +.. ipython:: python + + s1 = Series(randn(1000)) + s2 = Series(randn(1000)) + s1.cov(s2) + +Analogously, ``DataFrame`` has a method ``cov`` to compute pairwise covariances +among the series in the DataFrame, also excluding NA/null values. + +.. ipython:: python + + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.cov() + +.. _computation.correlation: + +Correlation +~~~~~~~~~~~ + +Several methods for computing correlations are provided: + +.. 
csv-table:: + :header: "Method name", "Description" + :widths: 20, 80 + + ``pearson (default)``, Standard correlation coefficient + ``kendall``, Kendall Tau correlation coefficient + ``spearman``, Spearman rank correlation coefficient + +.. \rho = \cov(x, y) / \sigma_x \sigma_y + +All of these are currently computed using pairwise complete observations. + +.. ipython:: python + + frame = DataFrame(randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame.ix[::2] = np.nan + + # Series with Series + frame['a'].corr(frame['b']) + frame['a'].corr(frame['b'], method='spearman') + + # Pairwise correlation of DataFrame columns + frame.corr() + +Note that non-numeric columns will be automatically excluded from the +correlation calculation. + +A related method ``corrwith`` is implemented on DataFrame to compute the +correlation between like-labeled Series contained in different DataFrame +objects. + +.. ipython:: python + + index = ['a', 'b', 'c', 'd', 'e'] + columns = ['one', 'two', 'three', 'four'] + df1 = DataFrame(randn(5, 4), index=index, columns=columns) + df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + df1.corrwith(df2) + df2.corrwith(df1, axis=1) + +.. _computation.ranking: + +Data ranking +~~~~~~~~~~~~ + +The ``rank`` method produces a data ranking with ties being assigned the mean +of the ranks (by default) for the group: + +.. ipython:: python + + s = Series(np.random.randn(5), index=list('abcde')) + s['d'] = s['b'] # so there's a tie + s.rank() + +``rank`` is also a DataFrame method and can rank either the rows (``axis=0``) +or the columns (``axis=1``). ``NaN`` values are excluded from the ranking. + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 6)) + df[4] = df[2][:5] # some ties + df + df.rank(1) + +``rank`` optionally takes a parameter ``ascending`` which by default is true; +when false, data is reverse-ranked, with larger values assigned a smaller rank. + +``rank`` supports different tie-breaking methods, specified with the ``method`` +parameter: + + - ``average`` : average rank of tied group + - ``min`` : lowest rank in the group + - ``max`` : highest rank in the group + - ``first`` : ranks assigned in the order they appear in the array + +.. note:: + + These methods are significantly faster (around 10-20x) than + ``scipy.stats.rankdata``. + +.. currentmodule:: pandas + +.. currentmodule:: pandas.stats.api + +.. _stats.moments: + +Moving (rolling) statistics / moments +------------------------------------- + +For working with time series data, a number of functions are provided for +computing common *moving* or *rolling* statistics. Among these are count, sum, +mean, median, correlation, variance, covariance, standard deviation, skewness, +and kurtosis. All of these methods are in the :mod:`pandas` namespace, but +otherwise they can be found in :mod:`pandas.stats.moments`. + +.. 
csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``rolling_count``, Number of non-null observations + ``rolling_sum``, Sum of values + ``rolling_mean``, Mean of values + ``rolling_median``, Arithmetic median of values + ``rolling_min``, Minimum + ``rolling_max``, Maximum + ``rolling_std``, Unbiased standard deviation + ``rolling_var``, Unbiased variance + ``rolling_skew``, Unbiased skewness (3rd moment) + ``rolling_kurt``, Unbiased kurtosis (4th moment) + ``rolling_quantile``, Sample quantile (value at %) + ``rolling_apply``, Generic apply + ``rolling_cov``, Unbiased covariance (binary) + ``rolling_corr``, Correlation (binary) + ``rolling_corr_pairwise``, Pairwise correlation of DataFrame columns + +Generally these methods all have the same interface. The binary operators +(e.g. ``rolling_corr``) take two Series or DataFrames. Otherwise, they all +accept the following arguments: + + - ``window``: size of moving window + - ``min_periods``: threshold of non-null data points to require (otherwise + result is NA) + - ``freq``: optionally specify a :ref: `frequency string ` + or :ref:`DateOffset ` to pre-conform the data to. + Note that prior to pandas v0.8.0, a keyword argument ``time_rule`` was used + instead of ``freq`` that referred to the legacy time rule constants + +These functions can be applied to ndarrays or Series objects: + +.. ipython:: python + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + ts.plot(style='k--') + + @savefig rolling_mean_ex.png width=4.5in + rolling_mean(ts, 60).plot(style='k') + +They can also be applied to DataFrame objects. This is really just syntactic +sugar for applying the moving window operator to all of the DataFrame's columns: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + df = DataFrame(randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig rolling_mean_frame.png width=4.5in + rolling_sum(df, 60).plot(subplots=True) + +The ``rolling_apply`` function takes an extra ``func`` argument and performs +generic rolling computations. The ``func`` argument should be a single function +that produces a single value from an ndarray input. Suppose we wanted to +compute the mean absolute deviation on a rolling basis: + +.. ipython:: python + + mad = lambda x: np.fabs(x - x.mean()).mean() + @savefig rolling_apply_ex.png width=4.5in + rolling_apply(ts, 60, mad).plot(style='k') + +.. _stats.moments.binary: + +Binary rolling moments +~~~~~~~~~~~~~~~~~~~~~~ + +``rolling_cov`` and ``rolling_corr`` can compute moving window statistics about +two ``Series`` or any combination of ``DataFrame/Series`` or +``DataFrame/DataFrame``. Here is the behavior in each case: + +- two ``Series``: compute the statistic for the pairing +- ``DataFrame/Series``: compute the statistics for each column of the DataFrame + with the passed Series, thus returning a DataFrame +- ``DataFrame/DataFrame``: compute statistic for matching column names, + returning a DataFrame + +For example: + +.. ipython:: python + + df2 = df[:20] + rolling_corr(df2, df2['B'], window=5) + +.. _stats.moments.corr_pairwise: + +Computing rolling pairwise correlations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In financial data analysis and other fields it's common to compute correlation +matrices for a collection of time series. More difficult is to compute a +moving-window correlation matrix. 
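For orientation, here is a rough sketch of the simpler two-series case using
``rolling_corr`` (the window length, column names, and random-walk data are all made up
for the example):

.. code-block:: python

   import numpy as np
   from pandas import DataFrame, date_range, rolling_corr

   idx = date_range('1/1/2000', periods=500)
   prices = DataFrame(np.random.randn(500, 2), index=idx,
                      columns=['A', 'B']).cumsum()

   # 60-observation moving correlation between the two columns
   moving_corr = rolling_corr(prices['A'], prices['B'], window=60)

Repeating that calculation for every pair of columns yields the full moving-window
correlation matrix.
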
This can be done using the +``rolling_corr_pairwise`` function, which yields a ``Panel`` whose ``items`` +are the dates in question: + +.. ipython:: python + + correls = rolling_corr_pairwise(df, 50) + correls[df.index[-50]] + +You can efficiently retrieve the time series of correlations between two +columns using ``ix`` indexing: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + @savefig rolling_corr_pairwise_ex.png width=4.5in + correls.ix[:, 'A', 'C'].plot() + +Exponentially weighted moment functions +--------------------------------------- + +A related set of functions are exponentially weighted versions of many of the +above statistics. A number of EW (exponentially weighted) functions are +provided using the blending method. For example, where :math:`y_t` is the +result and :math:`x_t` the input, we compute an exponentially weighted moving +average as + +.. math:: + + y_t = (1-\alpha) y_{t-1} + \alpha x_t + +One must have :math:`0 < \alpha \leq 1`, but rather than pass :math:`\alpha` +directly, it's easier to think about either the **span** or **center of mass +(com)** of an EW moment: + +.. math:: + + \alpha = + \begin{cases} + \frac{2}{s + 1}, s = \text{span}\\ + \frac{1}{c + 1}, c = \text{center of mass} + \end{cases} + +You can pass one or the other to these functions but not both. **Span** +corresponds to what is commonly called a "20-day EW moving average" for +example. **Center of mass** has a more physical interpretation. For example, +**span** = 20 corresponds to **com** = 9.5. Here is the list of functions +available: + +.. csv-table:: + :header: "Function", "Description" + :widths: 20, 80 + + ``ewma``, EW moving average + ``ewvar``, EW moving variance + ``ewstd``, EW moving standard deviation + ``ewmcorr``, EW moving correlation + ``ewmcov``, EW moving covariance + +Here are an example for a univariate time series: + +.. ipython:: python + + plt.close('all') + ts.plot(style='k--') + + @savefig ewma_ex.png width=4.5in + ewma(ts, span=20).plot(style='k') + +.. note:: + + The EW functions perform a standard adjustment to the initial observations + whereby if there are fewer observations than called for in the span, those + observations are reweighted accordingly. + +.. _stats.ols: + +Linear and panel regression +--------------------------- + +.. note:: + + We plan to move this functionality to `statsmodels + `__ for the next release. Some of the + result attributes may change names in order to foster naming consistency + with the rest of statsmodels. We will provide every effort to provide + compatibility with older versions of pandas, however. + +We have implemented a very fast set of *moving-window linear regression* +classes in pandas. Two different types of regressions are supported: + + - Standard ordinary least squares (OLS) multiple regression + - Multiple regression (OLS-based) on `panel data + `__ including with fixed-effects + (also known as entity or individual effects) or time-effects. + +Both kinds of linear models are accessed through the ``ols`` function in the +pandas namespace. They all take the following arguments to specify either a +static (full sample) or dynamic (moving window) regression: + + - ``window_type``: ``'full sample'`` (default), ``'expanding'``, or + ``rolling`` + - ``window``: size of the moving window in the ``window_type='rolling'`` + case. 
If ``window`` is specified, ``window_type`` will be automatically set + to ``'rolling'`` + - ``min_periods``: minimum number of time periods to require to compute the + regression coefficients + +Generally speaking, the ``ols`` works by being given a ``y`` (response) object +and an ``x`` (predictors) object. These can take many forms: + + - ``y``: a Series, ndarray, or DataFrame (panel model) + - ``x``: Series, DataFrame, dict of Series, dict of DataFrame or Panel + +Based on the types of ``y`` and ``x``, the model will be inferred to either a +panel model or a regular linear model. If the ``y`` variable is a DataFrame, +the result will be a panel model. In this case, the ``x`` variable must either +be a Panel, or a dict of DataFrame (which will be coerced into a Panel). + +Standard OLS regression +~~~~~~~~~~~~~~~~~~~~~~~ + +Let's pull in some sample data: + +.. ipython:: python + + from pandas.io.data import DataReader + symbols = ['MSFT', 'GOOG', 'AAPL'] + data = dict((sym, DataReader(sym, "yahoo")) + for sym in symbols) + panel = Panel(data).swapaxes('items', 'minor') + close_px = panel['Close'] + + # convert closing prices to returns + rets = close_px / close_px.shift(1) - 1 + rets.info() + +Let's do a static regression of ``AAPL`` returns on ``GOOG`` returns: + +.. ipython:: python + + model = ols(y=rets['AAPL'], x=rets.ix[:, ['GOOG']]) + model + model.beta + +If we had passed a Series instead of a DataFrame with the single ``GOOG`` +column, the model would have assigned the generic name ``x`` to the sole +right-hand side variable. + +We can do a moving window regression to see how the relationship changes over +time: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + model = ols(y=rets['AAPL'], x=rets.ix[:, ['GOOG']], + window=250) + + # just plot the coefficient for GOOG + @savefig moving_lm_ex.png width=5in + model.beta['GOOG'].plot() + +It looks like there are some outliers rolling in and out of the window in the +above regression, influencing the results. We could perform a simple +`winsorization `__ at the 3 STD level +to trim the impact of outliers: + +.. ipython:: python + :suppress: + + plt.close('all') + +.. ipython:: python + + winz = rets.copy() + std_1year = rolling_std(rets, 250, min_periods=20) + + # cap at 3 * 1 year standard deviation + cap_level = 3 * np.sign(winz) * std_1year + winz[np.abs(winz) > 3 * std_1year] = cap_level + + winz_model = ols(y=winz['AAPL'], x=winz.ix[:, ['GOOG']], + window=250) + + model.beta['GOOG'].plot(label="With outliers") + + @savefig moving_lm_winz.png width=5in + winz_model.beta['GOOG'].plot(label="Winsorized"); plt.legend(loc='best') + +So in this simple example we see the impact of winsorization is actually quite +significant. Note the correlation after winsorization remains high: + +.. ipython:: python + + winz.corrwith(rets) + +Multiple regressions can be run by passing a DataFrame with multiple columns +for the predictors ``x``: + +.. ipython:: python + + ols(y=winz['AAPL'], x=winz.drop(['AAPL'], axis=1)) + +Panel regression +~~~~~~~~~~~~~~~~ + +We've implemented moving window panel regression on potentially unbalanced +panel data (see `this article `__ if +this means nothing to you). Suppose we wanted to model the relationship between +the magnitude of the daily return and trading volume among a group of stocks, +and we want to pool all the data together to run one big regression. This is +actually quite easy: + +.. 
ipython:: python + + # make the units somewhat comparable + volume = panel['Volume'] / 1e8 + model = ols(y=volume, x={'return' : np.abs(rets)}) + model + +In a panel model, we can insert dummy (0-1) variables for the "entities" +involved (here, each of the stocks) to account the a entity-specific effect +(intercept): + +.. ipython:: python + + fe_model = ols(y=volume, x={'return' : np.abs(rets)}, + entity_effects=True) + fe_model + +Because we ran the regression with an intercept, one of the dummy variables +must be dropped or the design matrix will not be full rank. If we do not use an +intercept, all of the dummy variables will be included: + +.. ipython:: python + + fe_model = ols(y=volume, x={'return' : np.abs(rets)}, + entity_effects=True, intercept=False) + fe_model + +We can also include *time effects*, which demeans the data cross-sectionally at +each point in time (equivalent to including dummy variables for each +date). More mathematical care must be taken to properly compute the standard +errors in this case: + +.. ipython:: python + + te_model = ols(y=volume, x={'return' : np.abs(rets)}, + time_effects=True, entity_effects=True) + te_model + +Here the intercept (the mean term) is dropped by default because it will be 0 +according to the model assumptions, having subtracted off the group means. + +Result fields and tests +~~~~~~~~~~~~~~~~~~~~~~~ + +We'll leave it to the user to explore the docstrings and source, especially as +we'll be moving this code into statsmodels in the near future. + diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 00000000..3969af93 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,248 @@ +# -*- coding: utf-8 -*- +# +# pandas documentation build configuration file, created by +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../sphinxext')) + +sys.path.extend([ + + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), + '..', '../..', + 'sphinxext') + +]) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. + +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.doctest', + 'sphinx.ext.extlinks', + 'sphinx.ext.todo', + 'numpydoc', + 'ipython_directive', + 'ipython_console_highlighting', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.pngmath', + 'sphinx.ext.ifconfig', + 'sphinx.ext.autosummary', + 'matplotlib.sphinxext.only_directives', + 'matplotlib.sphinxext.plot_directive', + ] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates', '_templates/autosummary'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. 
+master_doc = 'index' + +# General information about the project. +project = u'pandas' +copyright = u'2008-2012, the pandas development team' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import pandas + +# version = '%s r%s' % (pandas.__version__, svn_version()) +version = '%s' % (pandas.__version__) + +# The full version, including alpha/beta/rc tags. +release = version + +# JP: added from sphinxdocs +autosummary_generate = True + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'agogo' + +# The style sheet to use for HTML and HTML Help pages. A file of that name +# must exist either in Sphinx' static/ path, or in one of the custom paths +# given in html_static_path. +#html_style = 'statsmodels.css' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['themes'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'pandas.tex', + u'pandas: powerful Python data analysis toolkit', + u'Wes McKinney\n\& PyData Development Team', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +# intersphinx_mapping = {'http://docs.scipy.org/': None} +import glob +autosummary_generate = glob.glob("*.rst") + +# extlinks alias +extlinks = {'issue': ('https://github.com/pydata/pandas/issues/%s', + 'issue ')} + +extlinks = {'pull request': ('https://github.com/pydata/pandas/pulls/%s', + 'pull request ')} + +extlinks = {'wiki': ('https://github.com/pydata/pandas/pulls/%s', + 'wiki ')} diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst new file mode 100644 index 00000000..23f603a8 --- /dev/null +++ b/doc/source/dsintro.rst @@ -0,0 +1,788 @@ +.. currentmodule:: pandas +.. _dsintro: + +************************ +Intro to Data Structures +************************ + +We'll start with a quick, non-comprehensive overview of the fundamental data +structures in pandas to get you started. The fundamental behavior about data +types, indexing, and axis labeling / alignment apply across all of the +objects. To get started, import numpy and load pandas into your namespace: + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + set_printoptions(precision=4, max_columns=8) + +.. 
ipython:: python + + import numpy as np + # will use a lot in examples + randn = np.random.randn + from pandas import * + +Here is a basic tenet to keep in mind: **data alignment is intrinsic**. Link +between labels and data will not be broken unless done so explicitly by you. + +We'll give a brief intro to the data structures, then consider all of the broad +categories of functionality and methods in separate sections. + +.. _basics.series: + +Series +------ + +:class:`Series` is a one-dimensional labeled array (technically a subclass of +ndarray) capable of holding any data type (integers, strings, floating point +numbers, Python objects, etc.). The axis labels are collectively referred to as +the **index**. The basic method to create a Series is to call: + +:: + + >>> s = Series(data, index=index) + +Here, ``data`` can be many different things: + + - a Python dict + - an ndarray + - a scalar value (like 5) + +The passed **index** is a list of axis labels. Thus, this separates into a few +cases depending on what **data is**: + +**From ndarray** + +If ``data`` is an ndarray, **index** must be the same length as **data**. If no +index is passed, one will be created having values ``[0, ..., len(data) - 1]``. + +.. ipython:: python + + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + s + s.index + + Series(randn(5)) + +.. note:: + + Starting in v0.8.0, pandas supports non-unique index values. In previous + version, if the index values are not unique an exception will + **not** be raised immediately, but attempting any operation involving the + index will later result in an exception. In other words, the Index object + containing the labels "lazily" checks whether the values are unique. The + reason for being lazy is nearly all performance-based (there are many + instances in computations, like parts of GroupBy, where the index is not + used). + +**From dict** + +If ``data`` is a dict, if **index** is passed the values in data corresponding +to the labels in the index will be pulled out. Otherwise, an index will be +constructed from the sorted keys of the dict, if possible. + +.. ipython:: python + + d = {'a' : 0., 'b' : 1., 'c' : 2.} + Series(d) + Series(d, index=['b', 'c', 'd', 'a']) + +.. note:: + + NaN (not a number) is the standard missing data marker used in pandas + +**From scalar value** If ``data`` is a scalar value, an index must be +provided. The value will be repeated to match the length of **index** + +.. ipython:: python + + Series(5., index=['a', 'b', 'c', 'd', 'e']) + +Series is ndarray-like +~~~~~~~~~~~~~~~~~~~~~~ + +As a subclass of ndarray, Series is a valid argument to most NumPy functions +and behaves similarly to a NumPy array. However, things like slicing also slice +the index. + +.. ipython :: python + + s[0] + s[:3] + s[s > s.median()] + s[[4, 3, 1]] + np.exp(s) + +We will address array-based indexing in a separate :ref:`section `. + +Series is dict-like +~~~~~~~~~~~~~~~~~~~ + +A Series is alike a fixed-size dict in that you can get and set values by index +label: + +.. ipython :: python + + s['a'] + s['e'] = 12. + s + 'e' in s + 'f' in s + +If a label is not contained, an exception + +.. code-block:: python + + >>> s['f'] + KeyError: 'f' + + >>> s.get('f') + nan + +Vectorized operations and label alignment with Series +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When doing data analysis, as with raw NumPy arrays looping through Series +value-by-value is usually not necessary. 
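To make that concrete, here is a small sketch contrasting an explicit loop with the
vectorized spelling (the Series is constructed just for the illustration; both forms
produce the same result):

.. code-block:: python

   from pandas import Series

   s = Series([1., 2., 3.], index=['a', 'b', 'c'])

   # element-by-element loop: works, but verbose and slow
   doubled = Series(dict((label, 2 * val) for label, val in s.iteritems()))

   # vectorized equivalent; the index is carried along automatically
   also_doubled = 2 * s

The vectorized spelling is both shorter and considerably faster.
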
Series can be also be passed into most +NumPy methods expecting an ndarray. + + +.. ipython:: python + + s + s + s * 2 + np.exp(s) + +A key difference between Series and ndarray is that operations between Series +automatically align the data based on label. Thus, you can write computations +without giving consideration to whether the Series involved have the same +labels. + +.. ipython:: python + + s[1:] + s[:-1] + +The result of an operation between unaligned Series will have the **union** of +the indexes involved. If a label is not found in one Series or the other, the +result will be marked as missing (NaN). Being able to write code without doing +any explicit data alignment grants immense freedom and flexibility in +interactive data analysis and research. The integrated data alignment features +of the pandas data structures set pandas apart from the majority of related +tools for working with labeled data. + +.. note:: + + In general, we chose to make the default result of operations between + differently indexed objects yield the **union** of the indexes in order to + avoid loss of information. Having an index label, though the data is + missing, is typically important information as part of a computation. You + of course have the option of dropping labels with missing data via the + **dropna** function. + +Name attribute +~~~~~~~~~~~~~~ + +.. _dsintro.name_attribute: + +Series can also have a ``name`` attribute: + +.. ipython:: python + + s = Series(np.random.randn(5), name='something') + s + s.name + +The Series ``name`` will be assigned automatically in many cases, in particular +when taking 1D slices of DataFrame as you will see below. + +.. _basics.dataframe: + +DataFrame +--------- + +**DataFrame** is a 2-dimensional labeled data structure with columns of +potentially different types. You can think of it like a spreadsheet or SQL +table, or a dict of Series objects. It is generally the most commonly used +pandas object. Like Series, DataFrame accepts many different kinds of input: + + - Dict of 1D ndarrays, lists, dicts, or Series + - 2-D numpy.ndarray + - `Structured or record + `__ ndarray + - A ``Series`` + - Another ``DataFrame`` + +Along with the data, you can optionally pass **index** (row labels) and +**columns** (column labels) arguments. If you pass an index and / or columns, +you are guaranteeing the index and / or columns of the resulting +DataFrame. Thus, a dict of Series plus a specific index will discard all data +not matching up to the passed index. + +If axis labels are not passed, they will be constructed from the input data +based on common sense rules. + +From dict of Series or dicts +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The result **index** will be the **union** of the indexes of the various +Series. If there are any nested dicts, these will be first converted to +Series. If no columns are passed, the columns will be the sorted list of dict +keys. + +.. ipython:: python + + d = {'one' : Series([1., 2., 3.], index=['a', 'b', 'c']), + 'two' : Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + df = DataFrame(d) + df + + DataFrame(d, index=['d', 'b', 'a']) + DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + +The row and column labels can be accessed respectively by accessing the +**index** and **columns** attributes: + +.. note:: + + When a particular set of columns is passed along with a dict of data, the + passed columns override the keys in the dict. + +.. 
ipython:: python + + df.index + df.columns + +From dict of ndarrays / lists +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ndarrays must all be the same length. If an index is passed, it must +clearly also be the same length as the arrays. If no index is passed, the +result will be ``range(n)``, where ``n`` is the array length. + +.. ipython:: python + + d = {'one' : [1., 2., 3., 4.], + 'two' : [4., 3., 2., 1.]} + DataFrame(d) + DataFrame(d, index=['a', 'b', 'c', 'd']) + +From structured or record array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This case is handled identically to a dict of arrays. + +.. ipython:: python + + data = np.zeros((2,),dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')]) + data[:] = [(1,2.,'Hello'),(2,3.,"World")] + + DataFrame(data) + DataFrame(data, index=['first', 'second']) + DataFrame(data, columns=['C', 'A', 'B']) + +.. note:: + + DataFrame is not intended to work exactly like a 2-dimensional NumPy + ndarray. + +.. _basics.dataframe.from_list_of_dicts: + +From a list of dicts +~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + DataFrame(data2) + DataFrame(data2, index=['first', 'second']) + DataFrame(data2, columns=['a', 'b']) + +.. _basics.dataframe.from_series: + +From a Series +~~~~~~~~~~~~~ + +The result will be a DataFrame with the same index as the input Series, and +with one column whose name is the original name of the Series (only if no other +column name provided). + +**Missing Data** + +Much more will be said on this topic in the :ref:`Missing data ` +section. To construct a DataFrame with missing data, use ``np.nan`` for those +values which are missing. Alternatively, you may pass a ``numpy.MaskedArray`` +as the data argument to the DataFrame constructor, and its masked entries will +be considered missing. + +Alternate Constructors +~~~~~~~~~~~~~~~~~~~~~~ + +.. _basics.dataframe.from_dict: + +**DataFrame.from_dict** + +``DataFrame.from_dict`` takes a dict of dicts or a dict of array-like sequences +and returns a DataFrame. It operates like the ``DataFrame`` constructor except +for the ``orient`` parameter which is ``'columns'`` by default, but which can be +set to ``'index'`` in order to use the dict keys as row labels. + +.. _basics.dataframe.from_records: + +**DataFrame.from_records** + +``DataFrame.from_records`` takes a list of tuples or an ndarray with structured +dtype. Works analogously to the normal ``DataFrame`` constructor, except that +index maybe be a specific field of the structured dtype to use as the index. +For example: + +.. ipython:: python + + data + DataFrame.from_records(data, index='C') + +.. _basics.dataframe.from_items: + +**DataFrame.from_items** + +``DataFrame.from_items`` works analogously to the form of the ``dict`` +constructor that takes a sequence of ``(key, value)`` pairs, where the keys are +column (or row, in the case of ``orient='index'``) names, and the value are the +column values (or row values). This can be useful for constructing a DataFrame +with the columns in a particular order without having to pass an explicit list +of columns: + +.. ipython:: python + + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])]) + +If you pass ``orient='index'``, the keys will be the row labels. But in this +case you must also pass the desired column names: + +.. 
ipython:: python + + DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])], + orient='index', columns=['one', 'two', 'three']) + +Column selection, addition, deletion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can treat a DataFrame semantically like a dict of like-indexed Series +objects. Getting, setting, and deleting columns works with the same syntax as +the analogous dict operations: + +.. ipython:: python + + df['one'] + df['three'] = df['one'] * df['two'] + df['flag'] = df['one'] > 2 + df + +Columns can be deleted or popped like with a dict: + +.. ipython:: python + + del df['two'] + three = df.pop('three') + df + +When inserting a scalar value, it will naturally be propagated to fill the +column: + +.. ipython:: python + + df['foo'] = 'bar' + df + +When inserting a Series that does not have the same index as the DataFrame, it +will be conformed to the DataFrame's index: + +.. ipython:: python + + df['one_trunc'] = df['one'][:2] + df + +You can insert raw ndarrays but their length must match the length of the +DataFrame's index. + +By default, columns get inserted at the end. The ``insert`` function is +available to insert at a particular location in the columns: + +.. ipython:: python + + df.insert(1, 'bar', df['one']) + df + +Indexing / Selection +~~~~~~~~~~~~~~~~~~~~ +The basics of indexing are as follows: + +.. csv-table:: + :header: "Operation", "Syntax", "Result" + :widths: 30, 20, 10 + + Select column, ``df[col]``, Series + Select row by label, ``df.xs(label)`` or ``df.ix[label]``, Series + Select row by location (int), ``df.ix[loc]``, Series + Slice rows, ``df[5:10]``, DataFrame + Select rows by boolean vector, ``df[bool_vec]``, DataFrame + +Row selection, for example, returns a Series whose index is the columns of the +DataFrame: + +.. ipython:: python + + df.xs('b') + df.ix[2] + +Note if a DataFrame contains columns of multiple dtypes, the dtype of the row +will be chosen to accommodate all of the data types (dtype=object is the most +general). + +For a more exhaustive treatment of more sophisticated label-based indexing and +slicing, see the :ref:`section on indexing `. We will address the +fundamentals of reindexing / conforming to new sets of lables in the +:ref:`section on reindexing `. + +Data alignment and arithmetic +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Data alignment between DataFrame objects automatically align on **both the +columns and the index (row labels)**. Again, the resulting object will have the +union of the column and row labels. + +.. ipython:: python + + df = DataFrame(randn(10, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(7, 3), columns=['A', 'B', 'C']) + df + df2 + +When doing an operation between DataFrame and Series, the default behavior is +to align the Series **index** on the DataFrame **columns**, thus `broadcasting +`__ +row-wise. For example: + +.. ipython:: python + + df - df.ix[0] + +In the special case of working with time series data, if the Series is a +TimeSeries (which it will be automatically if the index contains datetime +objects), and the DataFrame index also contains dates, the broadcasting will be +column-wise: + +.. ipython:: python + + index = date_range('1/1/2000', periods=8) + df = DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + df + type(df['A']) + df - df['A'] + +Technical purity aside, this case is so common in practice that supporting the +special case is preferable to the alternative of forcing the user to transpose +and do column-based alignment like so: + +.. 
ipython:: python + + (df.T - df['A']).T + +For explicit control over the matching and broadcasting behavior, see the +section on :ref:`flexible binary operations `. + +Operations with scalars are just as you would expect: + +.. ipython:: python + + df * 5 + 2 + 1 / df + df ** 4 + +.. _dsintro.boolean: + +Boolean operators work as well: + +.. ipython:: python + + df1 = DataFrame({'a' : [1, 0, 1], 'b' : [0, 1, 1] }, dtype=bool) + df2 = DataFrame({'a' : [0, 1, 1], 'b' : [1, 1, 0] }, dtype=bool) + df1 & df2 + df1 | df2 + df1 ^ df2 + -df1 + +Transposing +~~~~~~~~~~~ + +To transpose, access the ``T`` attribute (also the ``transpose`` function), +similar to an ndarray: + +.. ipython:: python + + # only show the first 5 rows + df[:5].T + +DataFrame interoperability with NumPy functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _dsintro.numpy_interop: + +Elementwise NumPy ufuncs (log, exp, sqrt, ...) and various other NumPy functions +can be used with no issues on DataFrame, assuming the data within are numeric: + +.. ipython:: python + + np.exp(df) + np.asarray(df) + +The dot method on DataFrame implements matrix multiplication: + +.. ipython:: python + + df.T.dot(df) + +Similarly, the dot method on Series implements dot product: + +.. ipython:: python + + s1 = Series(np.arange(5,10)) + s1.dot(s1) + +DataFrame is not intended to be a drop-in replacement for ndarray as its +indexing semantics are quite different in places from a matrix. + +Console display +~~~~~~~~~~~~~~~ + +For very large DataFrame objects, only a summary will be printed to the console +(here I am reading a CSV version of the **baseball** dataset from the **plyr** +R package): + +.. ipython:: python + :suppress: + + # force a summary to be printed + set_printoptions(max_rows=5) + +.. ipython:: python + + baseball = read_csv('data/baseball.csv') + print baseball + +.. ipython:: python + :suppress: + + # restore GlobalPrintConfig + reset_printoptions() + +However, using ``to_string`` will return a string representation of the +DataFrame in tabular form, though it won't always fit the console width: + +.. ipython:: python + + print baseball.ix[-20:, :12].to_string() + +DataFrame column types +~~~~~~~~~~~~~~~~~~~~~~ + +.. _dsintro.column_types: + +The four main types stored in pandas objects are float, int, boolean, and +object. A convenient ``dtypes`` attribute return a Series with the data type of +each column: + +.. ipython:: python + + baseball.dtypes + +The related method ``get_dtype_counts`` will return the number of columns of +each type: + +.. ipython:: python + + baseball.get_dtype_counts() + +DataFrame column attribute access and IPython completion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If a DataFrame column label is a valid Python variable name, the column can be +accessed like attributes: + +.. ipython:: python + + df = DataFrame({'foo1' : np.random.randn(5), + 'foo2' : np.random.randn(5)}) + df + df.foo1 + +The columns are also connected to the `IPython `__ +completion mechanism so they can be tab-completed: + +.. code-block:: ipython + + In [5]: df.fo + df.foo1 df.foo2 + +.. _basics.panel: + +Panel +----- + +Panel is a somewhat less-used, but still important container for 3-dimensional +data. The term `panel data `__ is +derived from econometrics and is partially responsible for the name pandas: +pan(el)-da(ta)-s. The names for the 3 axes are intended to give some semantic +meaning to describing operations involving panel data and, in particular, +econometric analysis of panel data. 
However, for the strict purposes of slicing +and dicing a collection of DataFrame objects, you may find the axis names +slightly arbitrary: + + - **items**: axis 0, each item corresponds to a DataFrame contained inside + - **major_axis**: axis 1, it is the **index** (rows) of each of the + DataFrames + - **minor_axis**: axis 2, it is the **columns** of each of the DataFrames + +Construction of Panels works about like you would expect: + +From 3D ndarray with optional axis labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + wp + + +From dict of DataFrame objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + + data = {'Item1' : DataFrame(randn(4, 3)), + 'Item2' : DataFrame(randn(4, 2))} + Panel(data) + +Note that the values in the dict need only be **convertible to +DataFrame**. Thus, they can be any of the other valid inputs to DataFrame as +per above. + +One helpful factory method is ``Panel.from_dict``, which takes a +dictionary of DataFrames as above, and the following named parameters: + +.. csv-table:: + :header: "Parameter", "Default", "Description" + :widths: 10, 10, 40 + + intersect, ``False``, drops elements whose indices do not align + orient, ``items``, use ``minor`` to use DataFrames' columns as panel items + +For example, compare to the construction above: + +.. ipython:: python + + Panel.from_dict(data, orient='minor') + +Orient is especially useful for mixed-type DataFrames. If you pass a dict of +DataFrame objects with mixed-type columns, all of the data will get upcasted to +``dtype=object`` unless you pass ``orient='minor'``: + +.. ipython:: python + + df = DataFrame({'a': ['foo', 'bar', 'baz'], + 'b': np.random.randn(3)}) + df + data = {'item1': df, 'item2': df} + panel = Panel.from_dict(data, orient='minor') + panel['a'] + panel['b'] + panel['b'].dtypes + +.. note:: + + Unfortunately Panel, being less commonly used than Series and DataFrame, + has been slightly neglected feature-wise. A number of methods and options + available in DataFrame are not available in Panel. This will get worked + on, of course, in future releases. And faster if you join me in working on + the codebase. + +.. _dsintro.to_panel: + +From DataFrame using ``to_panel`` method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This method was introduced in v0.7 to replace ``LongPanel.to_long``, and converts +a DataFrame with a two-level index to a Panel. + +.. ipython:: python + + midx = MultiIndex(levels=[['one', 'two'], ['x','y']], labels=[[1,1,0,0],[1,0,1,0]]) + df = DataFrame({'A' : [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, index=midx) + df.to_panel() + +.. _dsintro.panel_item_selection: + +Item selection / addition / deletion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to DataFrame functioning as a dict of Series, Panel is like a dict +of DataFrames: + +.. ipython:: python + + wp['Item1'] + wp['Item3'] = wp['Item1'] / wp['Item2'] + +The API for insertion and deletion is the same as for DataFrame. And as with +DataFrame, if the item is a valid python identifier, you can access it as an +attribute and tab-complete it in IPython. + +Transposing +~~~~~~~~~~~ + +A Panel can be rearranged using its ``transpose`` method (which does not make a +copy by default unless the data are heterogeneous): + +.. ipython:: python + + wp.transpose(2, 0, 1) + +Indexing / Selection +~~~~~~~~~~~~~~~~~~~~ + +.. 
csv-table:: + :header: "Operation", "Syntax", "Result" + :widths: 30, 20, 10 + + Select item, ``wp[item]``, DataFrame + Get slice at major_axis label, ``wp.major_xs(val)``, DataFrame + Get slice at minor_axis label, ``wp.minor_xs(val)``, DataFrame + +For example, using the earlier example data, we could do: + +.. ipython:: python + + wp['Item1'] + wp.major_xs(wp.major_axis[2]) + wp.minor_axis + wp.minor_xs('C') + +Conversion to DataFrame +~~~~~~~~~~~~~~~~~~~~~~~ + +A Panel can be represented in 2D form as a hierarchically indexed +DataFrame. See the section :ref:`hierarchical indexing ` +for more on this. To convert a Panel to a DataFrame, use the ``to_frame`` +method: + +.. ipython:: python + + panel = Panel(np.random.randn(3, 5, 4), items=['one', 'two', 'three'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['a', 'b', 'c', 'd']) + panel.to_frame() diff --git a/doc/source/faq.rst b/doc/source/faq.rst new file mode 100644 index 00000000..0f676ba6 --- /dev/null +++ b/doc/source/faq.rst @@ -0,0 +1,174 @@ +.. currentmodule:: pandas +.. _faq: + +******************************** +Frequently Asked Questions (FAQ) +******************************** + +.. ipython:: python + :suppress: + + from datetime import datetime + import numpy as np + np.random.seed(123456) + from pandas import * + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from dateutil.relativedelta import relativedelta + from pandas.tseries.api import * + from pandas.tseries.offsets import * + import matplotlib.pyplot as plt + plt.close('all') + + +Migrating from scikits.timeseries to pandas >= 0.8.0 +---------------------------------------------------- + +Starting with pandas 0.8.0, users of scikits.timeseries should have all of the +features that they need to migrate their code to use pandas. Portions of the +scikits.timeseries codebase for implementing calendar logic and timespan +frequency conversions (but **not** resampling, that has all been implemented +from scratch from the ground up) have been ported to the pandas codebase. + +The scikits.timeseries notions of ``Date`` and ``DateArray`` are responsible +for implementing calendar logic: + +:: + + In [16]: dt = ts.Date('Q', '1984Q3') + + # sic + In [17]: dt + Out[17]: + + In [18]: dt.asfreq('D', 'start') + Out[18]: + + In [19]: dt.asfreq('D', 'end') + Out[19]: + + In [20]: dt + 3 + Out[20]: + +``Date`` and ``DateArray`` from scikits.timeseries have been reincarnated in +pandas ``Period`` and ``PeriodIndex``: + +.. ipython:: python + + pnow('D') # scikits.timeseries.now() + Period(year=2007, month=3, day=15, freq='D') + p = Period('1984Q3') + p + p.asfreq('D', 'start') + p.asfreq('D', 'end') + (p + 3).asfreq('T') + 6 * 60 + 30 + rng = period_range('1990', '2010', freq='A') + rng + rng.asfreq('B', 'end') - 3 + +.. csv-table:: + :header: "scikits.timeseries", "pandas", "Notes" + :widths: 20, 20, 60 + + Date, Period, "A span of time, from yearly through to secondly" + DateArray, PeriodIndex, "An array of timespans" + convert, resample, "Frequency conversion in scikits.timeseries" + convert_to_annual, pivot_annual, "currently supports up to daily frequency, see :issue:`736`" + + +PeriodIndex / DateArray properties and functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The scikits.timeseries ``DateArray`` had a number of information +properties. Here are the pandas equivalents: + +.. 
csv-table:: + :header: "scikits.timeseries", "pandas", "Notes" + :widths: 20, 60, 20 + + get_steps, ``np.diff(idx.values)``, + has_missing_dates, ``not idx.is_full``, + is_full, ``idx.is_full``, + is_valid, ``idx.is_monotonic and idx.is_unique``, + is_chronological, ``is_monotonic``, + ``arr.sort_chronologically()``, ``idx.order()``, + +Frequency conversion +~~~~~~~~~~~~~~~~~~~~ + +Frequency conversion is implemented using the ``resample`` method on TimeSeries +and DataFrame objects (multiple time series). ``resample`` also works on panels +(3D). Here is some code that resamples daily data to montly with +scikits.timeseries: + +.. ipython:: python + + import scikits.timeseries as ts + data = ts.time_series(np.random.randn(50), start_date='Jan-2000', freq='M') + data + data.convert('A', func=np.mean) + +Here is the equivalent pandas code: + +.. ipython:: python + + rng = period_range('Jan-2000', periods=50, freq='M') + data = Series(np.random.randn(50), index=rng) + data + data.resample('A', how=np.mean) + +Plotting +~~~~~~~~ + +Much of the plotting functionality of scikits.timeseries has been ported and +adopted to pandas's data structures. For example: + +.. ipython:: python + + rng = period_range('1987Q2', periods=10, freq='Q-DEC') + data = Series(np.random.randn(10), index=rng) + + @savefig skts_ts_plot.png width=4.5in + plt.figure(); data.plot() + +Converting to and from period format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use the ``to_timestamp`` and ``to_period`` instance methods. + +Treatment of missing data +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Unlike scikits.timeseries, pandas data structures are not based on NumPy's +``MaskedArray`` object. Missing data is represented as ``NaN`` in numerical +arrays and either as ``None`` or ``NaN`` in non-numerical arrays. Implementing +a version of pandas's data structures that use MaskedArray is possible but +would require the involvement of a dedicated maintainer. Active pandas +developers are not interested in this. + +Resampling with timestamps and periods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``resample`` has a ``kind`` argument which allows you to resample time series +with a DatetimeIndex to PeriodIndex: + +.. ipython:: python + + rng = date_range('1/1/2000', periods=200, freq='D') + data = Series(np.random.randn(200), index=rng) + data[:10] + data.index + data.resample('M', kind='period') + +Similarly, resampling from periods to timestamps is possible with an optional +interval (``'start'`` or ``'end'``) convention: + +.. ipython:: python + + rng = period_range('Jan-2000', periods=50, freq='M') + data = Series(np.random.randn(50), index=rng) + resampled = data.resample('A', kind='timestamp', convention='end') + resampled.index + + diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst new file mode 100644 index 00000000..f4c0eae4 --- /dev/null +++ b/doc/source/gotchas.rst @@ -0,0 +1,243 @@ +.. currentmodule:: pandas +.. _gotchas: + +.. 
ipython:: python
   :suppress:

   import numpy as np
   from pandas import *
   randn = np.random.randn
   np.set_printoptions(precision=4, suppress=True)

*******************
Caveats and Gotchas
*******************

``NaN``, Integer ``NA`` values and ``NA`` type promotions
---------------------------------------------------------

Choice of ``NA`` representation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

For lack of ``NA`` (missing) support from the ground up in NumPy and Python in
general, we were given the difficult choice between either

- A *masked array* solution: an array of data and an array of boolean values
  indicating whether a value is present or missing
- Using a special sentinel value, bit pattern, or set of sentinel values to
  denote ``NA`` across the dtypes

For many reasons we chose the latter. After years of production use it has
proven, at least in my opinion, to be the best decision given the state of
affairs in NumPy and Python in general. The special value ``NaN``
(Not-A-Number) is used everywhere as the ``NA`` value, and there are API
functions ``isnull`` and ``notnull`` which can be used across the dtypes to
detect NA values.

However, this approach does come with a couple of trade-offs which I most
certainly have not ignored.

Support for integer ``NA``
~~~~~~~~~~~~~~~~~~~~~~~~~~

In the absence of high performance ``NA`` support being built into NumPy from
the ground up, the primary casualty is the ability to represent NAs in integer
arrays. For example:

.. ipython:: python

   s = Series([1, 2, 3, 4, 5], index=list('abcde'))
   s
   s.dtype

   s2 = s.reindex(['a', 'b', 'c', 'f', 'u'])
   s2
   s2.dtype

This trade-off is made largely for memory and performance reasons, and also so
that the resulting Series continues to be "numeric". One possibility is to use
``dtype=object`` arrays instead.

``NA`` type promotions
~~~~~~~~~~~~~~~~~~~~~~

When introducing NAs into an existing Series or DataFrame via ``reindex`` or
some other means, boolean and integer types will be promoted to a different
dtype in order to store the NAs. The promotions are summarized in this table:

.. csv-table::
   :header: "Typeclass","Promotion dtype for storing NAs"
   :widths: 40,60

   ``floating``, no change
   ``object``, no change
   ``integer``, cast to ``float64``
   ``boolean``, cast to ``object``

While this may seem like a heavy trade-off, I have found very few cases where
this is an issue in practice. Some explanation for the motivation is given in
the next section.

Why not make NumPy like R?
~~~~~~~~~~~~~~~~~~~~~~~~~~

Many people have suggested that NumPy should simply emulate the ``NA`` support
present in the more domain-specific statistical programming language `R
`__. Part of the reason is the NumPy type hierarchy:

.. csv-table::
   :header: "Typeclass","Dtypes"
   :widths: 30,70
   :delim: |

   ``numpy.floating`` | ``float16, float32, float64, float128``
   ``numpy.integer`` | ``int8, int16, int32, int64``
   ``numpy.unsignedinteger`` | ``uint8, uint16, uint32, uint64``
   ``numpy.object_`` | ``object_``
   ``numpy.bool_`` | ``bool_``
   ``numpy.character`` | ``string_, unicode_``

The R language, by contrast, only has a handful of built-in data types:
``integer``, ``numeric`` (floating-point), ``character``, and
``boolean``. ``NA`` types are implemented by reserving special bit patterns for
each type to be used as the missing value.
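To see the promotions from the table above in action, a small sketch (the labels and
values are arbitrary):

.. code-block:: python

   from pandas import Series, isnull

   s_int = Series([1, 2, 3])             # dtype: int64
   s_bool = Series([True, False, True])  # dtype: bool

   # reindexing past the existing labels introduces NAs, forcing a promotion
   s_int.reindex([0, 1, 2, 3]).dtype     # int64 -> float64
   s_bool.reindex([0, 1, 2, 3]).dtype    # bool  -> object

   # isnull / notnull work regardless of the resulting dtype
   isnull(s_int.reindex([0, 1, 2, 3]))

R avoids these promotions by spending one bit pattern inside each of its types on ``NA``.
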
While doing this with the full NumPy +type hierarchy would be possible, it would be a more substantial trade-off +(especially for the 8- and 16-bit data types) and implementation undertaking. + +An alternate approach is that of using masked arrays. A masked array is an +array of data with an associated boolean *mask* denoting whether each value +should be considered ``NA`` or not. I am personally not in love with this +approach as I feel that overall it places a fairly heavy burden on the user and +the library implementer. Additionally, it exacts a fairly high performance cost +when working with numerical data compared with the simple approach of using +``NaN``. Thus, I have chosen the Pythonic "practicality beats purity" approach +and traded integer ``NA`` capability for a much simpler approach of using a +special value in float and object arrays to denote ``NA``, and promoting +integer arrays to floating when NAs must be introduced. + +Integer indexing +---------------- + +Label-based slicing conventions +------------------------------- + +Non-monotonic indexes require exact matches +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Endpoints are inclusive +~~~~~~~~~~~~~~~~~~~~~~~ + +Compared with standard Python sequence slicing in which the slice endpoint is +not inclusive, label-based slicing in pandas **is inclusive**. The primary +reason for this is that it is often not possible to easily determine the +"successor" or next element after a particular label in an index. For example, +consider the following Series: + +.. ipython:: python + + s = Series(randn(6), index=list('abcdef')) + s + +Suppose we wished to slice from ``c`` to ``e``, using integers this would be + +.. ipython:: python + + s[2:5] + +However, if you only had ``c`` and ``e``, determining the next element in the +index can be somewhat complicated. For example, the following does not work: + +:: + + s.ix['c':'e'+1] + +A very common use case is to limit a time series to start and end at two +specific dates. To enable this, we made the design design to make label-based +slicing include both endpoints: + +.. ipython:: python + + s.ix['c':'e'] + +This is most definitely a "practicality beats purity" sort of thing, but it is +something to watch out for if you expect label-based slicing to behave exactly +in the way that standard Python integer slicing works. + +Miscellaneous indexing gotchas +------------------------------ + +Reindex versus ix gotchas +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Many users will find themselves using the ``ix`` indexing capabilities as a +concise means of selecting data from a pandas object: + +.. ipython:: python + + df = DataFrame(randn(6, 4), columns=['one', 'two', 'three', 'four'], + index=list('abcdef')) + df + df.ix[['b', 'c', 'e']] + +This is, of course, completely equivalent *in this case* to using th +``reindex`` method: + +.. ipython:: python + + df.reindex(['b', 'c', 'e']) + +Some might conclude that ``ix`` and ``reindex`` are 100% equivalent based on +this. This is indeed true **except in the case of integer indexing**. For +example, the above operation could alternately have been expressed as: + +.. ipython:: python + + df.ix[[1, 2, 4]] + +If you pass ``[1, 2, 4]`` to ``reindex`` you will get another thing entirely: + +.. ipython:: python + + df.reindex([1, 2, 4]) + +So it's important to remember that ``reindex`` is **strict label indexing +only**. This can lead to some potentially surprising results in pathological +cases where an index contains, say, both integers and strings: + +.. 
+
+Miscellaneous indexing gotchas
+------------------------------
+
+Reindex versus ix gotchas
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Many users will find themselves using the ``ix`` indexing capabilities as a
+concise means of selecting data from a pandas object:
+
+.. ipython:: python
+
+   df = DataFrame(randn(6, 4), columns=['one', 'two', 'three', 'four'],
+                  index=list('abcdef'))
+   df
+   df.ix[['b', 'c', 'e']]
+
+This is, of course, completely equivalent *in this case* to using the
+``reindex`` method:
+
+.. ipython:: python
+
+   df.reindex(['b', 'c', 'e'])
+
+Some might conclude that ``ix`` and ``reindex`` are 100% equivalent based on
+this. This is indeed true **except in the case of integer indexing**. For
+example, the above operation could alternately have been expressed as:
+
+.. ipython:: python
+
+   df.ix[[1, 2, 4]]
+
+If you pass ``[1, 2, 4]`` to ``reindex`` you will get another thing entirely:
+
+.. ipython:: python
+
+   df.reindex([1, 2, 4])
+
+So it's important to remember that ``reindex`` is **strict label indexing
+only**. This can lead to some potentially surprising results in pathological
+cases where an index contains, say, both integers and strings:
+
+.. ipython:: python
+
+   s = Series([1, 2, 3], index=['a', 0, 1])
+   s
+   s.ix[[0, 1]]
+   s.reindex([0, 1])
+
+Because the index in this case does not contain solely integers, ``ix`` falls
+back on integer indexing. By contrast, ``reindex`` only looks for the values
+passed in the index, thus finding the integers ``0`` and ``1``. While it would
+be possible to insert some logic to check whether a passed sequence is all
+contained in the index, that logic would exact a very high cost in large data
+sets.
+
+Timestamp limitations
+---------------------
+
+Minimum and maximum timestamps
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Since pandas represents timestamps in nanosecond resolution, the timespan that
+can be represented using a 64-bit integer is limited to approximately 584 years:
+
+.. ipython:: python
+
+   begin = Timestamp(-9223285636854775809L)
+   begin
+   end = Timestamp(np.iinfo(np.int64).max)
+   end
+
+If you need to represent time series data outside the nanosecond timespan, use
+PeriodIndex:
+
+.. ipython:: python
+
+   span = period_range('1215-01-01', '1381-01-01', freq='D')
+   span
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
new file mode 100644
index 00000000..831fafe5
--- /dev/null
+++ b/doc/source/groupby.rst
@@ -0,0 +1,606 @@
+.. currentmodule:: pandas
+.. _groupby:
+
+.. ipython:: python
+   :suppress:
+
+   import numpy as np
+   np.random.seed(123456)
+   from pandas import *
+   randn = np.random.randn
+   np.set_printoptions(precision=4, suppress=True)
+   import matplotlib.pyplot as plt
+   plt.close('all')
+
+*****************************
+Group By: split-apply-combine
+*****************************
+
+By "group by" we refer to a process involving one or more of the following
+steps:
+
+ - **Splitting** the data into groups based on some criteria
+ - **Applying** a function to each group independently
+ - **Combining** the results into a data structure
+
+Of these, the split step is the most straightforward. In fact, in many
+situations you may wish to split the data set into groups and do something with
+those groups yourself. In the apply step, we might wish to do one of the
+following:
+
+ - **Aggregation**: computing a summary statistic (or statistics) about each
+   group. Some examples:
+
+   - Compute group sums or means
+   - Compute group sizes / counts
+
+ - **Transformation**: perform some group-specific computations and return a
+   like-indexed object. Some examples:
+
+   - Standardizing data (zscore) within group
+   - Filling NAs within groups with a value derived from each group
+
+ - Some combination of the above: GroupBy will examine the results of the apply
+   step and try to return a sensibly combined result if it doesn't fit into
+   either of the above two categories
+
+Since the set of object instance methods on pandas data structures is generally
+rich and expressive, we often simply want to invoke, say, a DataFrame function
+on each group. The name GroupBy should be quite familiar to those who have used
+a SQL-based tool (or ``itertools``), in which you can write code like:
+
+.. code-block:: sql
+
+   SELECT Column1, Column2, mean(Column3), sum(Column4)
+   FROM SomeTable
+   GROUP BY Column1, Column2
+
+We aim to make operations like this natural and easy to express using
+pandas. We'll address each area of GroupBy functionality, then provide some
+non-trivial examples / use cases.
+
+.. _groupby.split:
+
+Splitting an object into groups
+-------------------------------
+
+pandas objects can be split on any of their axes.
The abstract definition of +grouping is to provide a mapping of labels to group names. To create a GroupBy +object (more on what the GroupBy object is later), you do the following: + +.. code-block:: ipython + + # default is axis=0 + >>> grouped = obj.groupby(key) + >>> grouped = obj.groupby(key, axis=1) + >>> grouped = obj.groupby([key1, key2]) + +The mapping can be specified many different ways: + + - A Python function, to be called on each of the axis labels + - A list or NumPy array of the same length as the selected axis + - A dict or Series, providing a ``label -> group name`` mapping + - For DataFrame objects, a string indicating a column to be used to group. Of + course ``df.groupby('A')`` is just syntactic sugar for + ``df.groupby(df['A'])``, but it makes life simpler + - A list of any of the above things + +Collectively we refer to the grouping objects as the **keys**. For example, +consider the following DataFrame: + +.. ipython:: python + + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : randn(8), 'D' : randn(8)}) + df + +We could naturally group by either the ``A`` or ``B`` columns or both: + +.. ipython:: python + + grouped = df.groupby('A') + grouped = df.groupby(['A', 'B']) + +These will split the DataFrame on its index (rows). We could also split by the +columns: + +.. ipython:: + + In [4]: def get_letter_type(letter): + ...: if letter.lower() in 'aeiou': + ...: return 'vowel' + ...: else: + ...: return 'consonant' + ...: + + In [5]: grouped = df.groupby(get_letter_type, axis=1) + +Starting with 0.8, pandas Index objects now supports duplicate values. If a +non-unique index is used as the group key in a groupby operation, all values +for the same index value will be considered to be in one group and thus the +output of aggregation functions will only contain unique index values: + +.. ipython:: python + + lst = [1, 2, 3, 1, 2, 3] + + s = Series([1, 2, 3, 10, 20, 30], lst) + + grouped = s.groupby(level=0) + + grouped.first() + + grouped.last() + + grouped.sum() + +Note that **no splitting occurs** until it's needed. Creating the GroupBy object +only verifies that you've passed a valid mapping. + +.. note:: + + Many kinds of complicated data manipulations can be expressed in terms of + GroupBy operations (though can't be guaranteed to be the most + efficient). You can get quite creative with the label mapping functions. + +.. _groupby.attributes: + +GroupBy object attributes +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``groups`` attribute is a dict whose keys are the computed unique groups +and corresponding values being the axis labels belonging to each group. In the +above example we have: + +.. ipython:: python + + df.groupby('A').groups + df.groupby(get_letter_type, axis=1).groups + +Calling the standard Python ``len`` function on the GroupBy object just returns +the length of the ``groups`` dict, so it is largely just a convenience: + +.. ipython:: python + + grouped = df.groupby(['A', 'B']) + grouped.groups + len(grouped) + +By default the group keys are sorted during the groupby operation. You may +however pass ``sort``=``False`` for potential speedups: + +.. ipython:: python + + df2 = DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]}) + df2.groupby(['X'], sort=True).sum() + df2.groupby(['X'], sort=False).sum() + +.. 
_groupby.multiindex: + +GroupBy with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~ + +With :ref:`hierarchically-indexed data `, it's quite +natural to group by one of the levels of the hierarchy. + +.. ipython:: python + :suppress: + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + tuples + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + s = Series(randn(8), index=index) + +.. ipython:: python + + s + grouped = s.groupby(level=0) + grouped.sum() + +If the MultiIndex has names specified, these can be passed instead of the level +number: + +.. ipython:: python + + s.groupby(level='second').sum() + +The aggregation functions such as ``sum`` will take the level parameter +directly. Additionally, the resulting index will be named according to the +chosen level: + +.. ipython:: python + + s.sum(level='second') + +Also as of v0.6, grouping with multiple levels is supported. + +.. ipython:: python + :suppress: + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + s = Series(randn(8), index=index) + +.. ipython:: python + + s + s.groupby(level=['first','second']).sum() + +More on the ``sum`` function and aggregation later. + +DataFrame column selection in GroupBy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Once you have created the GroupBy object from a DataFrame, for example, you +might want to do something different for each of the columns. Thus, using +``[]`` similar to getting a column from a DataFrame, you can do: + +.. ipython:: python + + grouped = df.groupby(['A']) + grouped_C = grouped['C'] + grouped_D = grouped['D'] + +This is mainly syntactic sugar for the alternative and much more verbose: + +.. ipython:: python + + df['C'].groupby(df['A']) + +Additionally this method avoids recomputing the internal grouping information +derived from the passed key. + +.. _groupby.iterating: + +Iterating through groups +------------------------ + +With the GroupBy object in hand, iterating through the grouped data is very +natural and functions similarly to ``itertools.groupby``: + +.. ipython:: + + In [4]: grouped = df.groupby('A') + + In [5]: for name, group in grouped: + ...: print name + ...: print group + ...: + +In the case of grouping by multiple keys, the group name will be a tuple: + +.. ipython:: + + In [5]: for name, group in df.groupby(['A', 'B']): + ...: print name + ...: print group + ...: + +It's standard Python-fu but remember you can unpack the tuple in the for loop +statement if you wish: ``for (k1, k2), group in grouped:``. + +.. _groupby.aggregate: + +Aggregation +----------- + +Once the GroupBy object has been created, several methods are available to +perform a computation on the grouped data. An obvious one is aggregation via +the ``aggregate`` or equivalently ``agg`` method: + +.. ipython:: python + + grouped = df.groupby('A') + grouped.aggregate(np.sum) + + grouped = df.groupby(['A', 'B']) + grouped.aggregate(np.sum) + +As you can see, the result of the aggregation will have the group names as the +new index along the grouped axis. In the case of multiple keys, the result is a +:ref:`MultiIndex ` by default, though this can be +changed by using the ``as_index`` option: + +.. 
ipython:: python + + grouped = df.groupby(['A', 'B'], as_index=False) + grouped.aggregate(np.sum) + + df.groupby('A', as_index=False).sum() + +Note that you could use the ``reset_index`` DataFrame function to achieve the +same result as the column names are stored in the resulting ``MultiIndex``: + +.. ipython:: python + + df.groupby(['A', 'B']).sum().reset_index() + +Another simple aggregation example is to compute the size of each group. +This is included in GroupBy as the ``size`` method. It returns a Series whose +index are the group names and whose values are the sizes of each group. + +.. ipython:: python + + grouped.size() + + +.. _groupby.aggregate.multifunc: + +Applying multiple functions at once +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With grouped Series you can also pass a list or dict of functions to do +aggregation with, outputting a DataFrame: + +.. ipython:: python + + grouped = df.groupby('A') + grouped['C'].agg([np.sum, np.mean, np.std]) + +If a dict is passed, the keys will be used to name the columns. Otherwise the +function's name (stored in the function object) will be used. + +.. ipython:: python + + grouped['D'].agg({'result1' : np.sum, + 'result2' : np.mean}) + +On a grouped DataFrame, you can pass a list of functions to apply to each +column, which produces an aggregated result with a hierarchical index: + +.. ipython:: python + + grouped.agg([np.sum, np.mean, np.std]) + +Passing a dict of functions has different behavior by default, see the next +section. + +Applying different functions to DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +By passing a dict to ``aggregate`` you can apply a different aggregation to the +columns of a DataFrame: + +.. ipython:: python + + grouped.agg({'C' : np.sum, + 'D' : lambda x: np.std(x, ddof=1)}) + +The function names can also be strings. In order for a string to be valid it +must be either implemented on GroupBy or available via :ref:`dispatching +`: + +.. ipython:: python + + grouped.agg({'C' : 'sum', 'D' : 'std'}) + +.. _groupby.aggregate.cython: + +Cython-optimized aggregation functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some common aggregations, currently only ``sum``, ``mean``, and ``std``, have +optimized Cython implementations: + +.. ipython:: python + + df.groupby('A').sum() + df.groupby(['A', 'B']).mean() + +Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above +code would work even without the special versions via dispatching (see below). + +.. _groupby.transform: + +Transformation +-------------- + +The ``transform`` method returns an object that is indexed the same (same size) +as the one being grouped. Thus, the passed transform function should return a +result that is the same size as the group chunk. For example, suppose we wished +to standardize the data within each group: + +.. ipython:: python + + index = date_range('10/1/1999', periods=1100) + ts = Series(np.random.normal(0.5, 2, 1100), index) + ts = rolling_mean(ts, 100, 100).dropna() + + ts.head() + ts.tail() + key = lambda x: x.year + zscore = lambda x: (x - x.mean()) / x.std() + transformed = ts.groupby(key).transform(zscore) + +We would expect the result to now have mean 0 and standard deviation 1 within +each group, which we can easily check: + +.. 
ipython:: python + + # Original Data + grouped = ts.groupby(key) + grouped.mean() + grouped.std() + + # Transformed Data + grouped_trans = transformed.groupby(key) + grouped_trans.mean() + grouped_trans.std() + +We can also visually compare the original and transformed data sets. + +.. ipython:: python + + compare = DataFrame({'Original': ts, 'Transformed': transformed}) + + @savefig groupby_transform_plot.png width=4in + compare.plot() + +Another common data transform is to replace missing data with the group mean. + +.. ipython:: python + :suppress: + + cols = ['A', 'B', 'C'] + values = randn(1000, 3) + values[np.random.randint(0, 1000, 100), 0] = np.nan + values[np.random.randint(0, 1000, 50), 1] = np.nan + values[np.random.randint(0, 1000, 200), 2] = np.nan + data_df = DataFrame(values, columns=cols) + +.. ipython:: python + + data_df + + countries = np.array(['US', 'UK', 'GR', 'JP']) + key = countries[np.random.randint(0, 4, 1000)] + + grouped = data_df.groupby(key) + + # Non-NA count in each group + grouped.count() + + f = lambda x: x.fillna(x.mean()) + + transformed = grouped.transform(f) + +We can verify that the group means have not changed in the transformed data +and that the transformed data contains no NAs. + +.. ipython:: python + + grouped_trans = transformed.groupby(key) + + grouped.mean() # original group means + grouped_trans.mean() # transformation did not change group means + + grouped.count() # original has some missing data points + grouped_trans.count() # counts after transformation + grouped_trans.size() # Verify non-NA count equals group size + +.. _groupby.dispatch: + +Dispatching to instance methods +------------------------------- + +When doing an aggregation or transformation, you might just want to call an +instance method on each data group. This is pretty easy to do by passing lambda +functions: + +.. ipython:: python + + grouped = df.groupby('A') + grouped.agg(lambda x: x.std()) + +But, it's rather verbose and can be untidy if you need to pass additional +arguments. Using a bit of metaprogramming cleverness, GroupBy now has the +ability to "dispatch" method calls to the groups: + +.. ipython:: python + + grouped.std() + +What is actually happening here is that a function wrapper is being +generated. When invoked, it takes any passed arguments and invokes the function +with any arguments on each group (in the above example, the ``std`` +function). The results are then combined together much in the style of ``agg`` +and ``transform`` (it actually uses ``apply`` to infer the gluing, documented +next). This enables some operations to be carried out rather succinctly: + +.. ipython:: python + + tsdf = DataFrame(randn(1000, 3), + index=date_range('1/1/2000', periods=1000), + columns=['A', 'B', 'C']) + tsdf.ix[::2] = np.nan + grouped = tsdf.groupby(lambda x: x.year) + grouped.fillna(method='pad') + +In this example, we chopped the collection of time series into yearly chunks +then independently called :ref:`fillna ` on the +groups. + +.. _groupby.apply: + +Flexible ``apply`` +------------------ + +Some operations on the grouped data might not fit into either the aggregate or +transform categories. Or, you may simply want GroupBy to infer how to combine +the results. For these, use the ``apply`` function, which can be substituted +for both ``aggregate`` and ``transform`` in many standard use cases. However, +``apply`` can handle some exceptional use cases, for example: + +.. 
ipython:: python + + df + grouped = df.groupby('A') + + # could also just call .describe() + grouped['C'].apply(lambda x: x.describe()) + +The dimension of the returned result can also change: + +.. ipython:: + + In [8]: grouped = df.groupby('A')['C'] + + In [10]: def f(group): + ....: return DataFrame({'original' : group, + ....: 'demeaned' : group - group.mean()}) + ....: + + In [11]: grouped.apply(f) + + +Other useful features +--------------------- + +Automatic exclusion of "nuisance" columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Again consider the example DataFrame we've been looking at: + +.. ipython:: python + + df + +Supposed we wished to compute the standard deviation grouped by the ``A`` +column. There is a slight problem, namely that we don't care about the data in +column ``B``. We refer to this as a "nuisance" column. If the passed +aggregation function can't be applied to some columns, the troublesome columns +will be (silently) dropped. Thus, this does not pose any problems: + +.. ipython:: python + + df.groupby('A').std() + +NA group handling +~~~~~~~~~~~~~~~~~ + +If there are any NaN values in the grouping key, these will be automatically +excluded. So there will never be an "NA group". This was not the case in older +versions of pandas, but users were generally discarding the NA group anyway +(and supporting it was an implementation headache). + +Grouping with ordered factors +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Categorical variables represented as instance of pandas's ``Factor`` class can +be used as group keys. If so, the order of the levels will be preserved: + +.. ipython:: python + + data = Series(np.random.randn(100)) + + factor = qcut(data, [0, .25, .5, .75, 1.]) + + data.groupby(factor).mean() diff --git a/doc/source/index.rst b/doc/source/index.rst new file mode 100644 index 00000000..b4b9231b --- /dev/null +++ b/doc/source/index.rst @@ -0,0 +1,132 @@ +.. Pandas documentation master file, created by + +********************************************* +pandas: powerful Python data analysis toolkit +********************************************* + +`PDF Version `__ + +.. module:: pandas + +**Date**: |today| **Version**: |version| + +**Binary Installers:** http://pypi.python.org/pypi/pandas + +**Source Repository:** http://github.com/pydata/pandas + +**Issues & Ideas:** https://github.com/pydata/pandas/issues + +**Q&A Support:** http://stackoverflow.com/questions/tagged/pandas + +**Developer Mailing List:** http://groups.google.com/group/pystatsmodels + +**pandas** is a `Python `__ package providing fast, +flexible, and expressive data structures designed to make working with +"relational" or "labeled" data both easy and intuitive. It aims to be the +fundamental high-level building block for doing practical, **real world** data +analysis in Python. Additionally, it has the broader goal of becoming **the +most powerful and flexible open source data analysis / manipulation tool +available in any language**. It is already well on its way toward this goal. + +pandas is well suited for many different kinds of data: + + - Tabular data with heterogeneously-typed columns, as in an SQL table or + Excel spreadsheet + - Ordered and unordered (not necessarily fixed-frequency) time series data. + - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and + column labels + - Any other form of observational / statistical data sets. 
The data actually + need not be labeled at all to be placed into a pandas data structure + +The two primary data structures of pandas, :class:`Series` (1-dimensional) +and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use +cases in finance, statistics, social science, and many areas of +engineering. For R users, :class:`DataFrame` provides everything that R's +``data.frame`` provides and much more. pandas is built on top of `NumPy +`__ and is intended to integrate well within a scientific +computing environment with many other 3rd party libraries. + +Here are just a few of the things that pandas does well: + + - Easy handling of **missing data** (represented as NaN) in floating point as + well as non-floating point data + - Size mutability: columns can be **inserted and deleted** from DataFrame and + higher dimensional objects + - Automatic and explicit **data alignment**: objects can be explicitly + aligned to a set of labels, or the user can simply ignore the labels and + let `Series`, `DataFrame`, etc. automatically align the data for you in + computations + - Powerful, flexible **group by** functionality to perform + split-apply-combine operations on data sets, for both aggregating and + transforming data + - Make it **easy to convert** ragged, differently-indexed data in other + Python and NumPy data structures into DataFrame objects + - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets + - Intuitive **merging** and **joining** data sets + - Flexible **reshaping** and pivoting of data sets + - **Hierarchical** labeling of axes (possible to have multiple labels per + tick) + - Robust IO tools for loading data from **flat files** (CSV and delimited), + Excel files, databases, and saving / loading data from the ultrafast **HDF5 + format** + - **Time series**-specific functionality: date range generation and frequency + conversion, moving window statistics, moving window linear regressions, + date shifting and lagging, etc. + +Many of these principles are here to address the shortcomings frequently +experienced using other languages / scientific research environments. For data +scientists, working with data is typically divided into multiple stages: +munging and cleaning data, analyzing / modeling it, then organizing the results +of the analysis into a form suitable for plotting or tabular display. pandas +is the ideal tool for all of these tasks. + +Some other notes + + - pandas is **fast**. Many of the low-level algorithmic bits have been + extensively tweaked in `Cython `__ code. However, as with + anything else generalization usually sacrifices performance. So if you focus + on one feature for your application you may be able to create a faster + specialized tool. + + - pandas will soon become a dependency of `statsmodels + `__, making it an important part of the + statistical computing ecosystem in Python. + + - pandas has been used extensively in production in financial applications. + +.. note:: + + This documentation assumes general familiarity with NumPy. If you haven't + used NumPy much or at all, do invest some time in `learning about NumPy + `__ first. + +See the package overview for more detail about what's in the library. + + +.. 
toctree:: + :hidden: + :maxdepth: 3 + + whatsnew + install + faq + overview + dsintro + basics + indexing + computation + missing_data + groupby + merging + reshaping + timeseries + visualization + io + sparse + gotchas + r_interface + related + comparison_with_r + api + diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst new file mode 100644 index 00000000..c2ef0d74 --- /dev/null +++ b/doc/source/indexing.rst @@ -0,0 +1,1011 @@ +.. _indexing: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + np.random.seed(123456) + from pandas import * + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + +*************************** +Indexing and selecting data +*************************** + +The axis labeling information in pandas objects serves many purposes: + + - Identifies data (i.e. provides *metadata*) using known indicators, + important for for analysis, visualization, and interactive console display + - Enables automatic and explicit data alignment + - Allows intuitive getting and setting of subsets of the data set + +In this section / chapter, we will focus on the final point: namely, how to +slice, dice, and generally get and set subsets of pandas objects. The primary +focus will be on Series and DataFrame as they have received more development +attention in this area. Expect more work to be invested higher-dimensional data +structures (including Panel) in the future, especially in label-based advanced +indexing. + +.. _indexing.basics: + +Basics +------ + +As mentioned when introducing the data structures in the :ref:`last section +`, the primary function of indexing with ``[]`` (a.k.a. ``__getitem__`` +for those familiar with implementing class behavior in Python) is selecting out +lower-dimensional slices. Thus, + + - **Series**: ``series[label]`` returns a scalar value + - **DataFrame**: ``frame[colname]`` returns a Series corresponding to the + passed column name + - **Panel**: ``panel[itemname]`` returns a DataFrame corresponding to the + passed item name + +Here we construct a simple time series data set to use for illustrating the +indexing functionality: + +.. ipython:: python + + dates = np.asarray(date_range('1/1/2000', periods=8)) + df = DataFrame(randn(8, 4), index=dates, columns=['A', 'B', 'C', 'D']) + df + panel = Panel({'one' : df, 'two' : df - df.mean()}) + panel + +.. note:: + + None of the indexing functionality is time series specific unless + specifically stated. + +Thus, as per above, we have the most basic indexing using ``[]``: + +.. ipython:: python + + s = df['A'] + s[dates[5]] + panel['two'] + + +.. _indexing.basics.get_value: + +Fast scalar value getting and setting +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Since indexing with ``[]`` must handle a lot of cases (single-label access, +slicing, boolean indexing, etc.), it has a bit of overhead in order to figure +out what you're asking for. If you only want to access a scalar value, the +fastest way is to use the ``get_value`` method, which is implemented on all of +the data structures: + +.. ipython:: python + + s.get_value(dates[5]) + df.get_value(dates[5], 'A') + +There is an analogous ``set_value`` method which has the additional capability +of enlarging an object. This method *always* returns a reference to the object +it modified, which in the fast of enlargement, will be a **new object**: + +.. 
ipython:: python + + df.set_value(dates[5], 'E', 7) + +Additional Column Access +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.columns.multiple: + +.. _indexing.df_cols: + +You may access a column on a dataframe directly as an attribute: + +.. ipython:: python + + df.A + +If you are using the IPython environment, you may also use tab-completion to +see the accessible columns of a DataFrame. + +You can pass a list of columns to ``[]`` to select columns in that order: +If a column is not contained in the DataFrame, an exception will be +raised. Multiple columns can also be set in this manner: + +.. ipython:: python + + df + df[['B', 'A']] = df[['A', 'B']] + df + +You may find this useful for applying a transform (in-place) to a subset of the +columns. + +Data slices on other axes +~~~~~~~~~~~~~~~~~~~~~~~~~ + +It's certainly possible to retrieve data slices along the other axes of a +DataFrame or Panel. We tend to refer to these slices as +*cross-sections*. DataFrame has the ``xs`` function for retrieving rows as +Series and Panel has the analogous ``major_xs`` and ``minor_xs`` functions for +retrieving slices as DataFrames for a given ``major_axis`` or ``minor_axis`` +label, respectively. + +.. ipython:: python + + date = dates[5] + df.xs(date) + panel.major_xs(date) + panel.minor_xs('A') + + +Slicing ranges +~~~~~~~~~~~~~~ + +The most robust and consistent way of slicing ranges along arbitrary axes is +described in the :ref:`Advanced indexing ` section detailing +the ``.ix`` method. For now, we explain the semantics of slicing using the +``[]`` operator. + +With Series, the syntax works exactly as with an ndarray, returning a slice of +the values and the corresponding labels: + +.. ipython:: python + + s[:5] + s[::2] + s[::-1] + +Note that setting works as well: + +.. ipython:: python + + s2 = s.copy() + s2[:5] = 0 + s2 + +With DataFrame, slicing inside of ``[]`` **slices the rows**. This is provided +largely as a convenience since it is such a common operation. + +.. ipython:: python + + df[:3] + df[::-1] + +Boolean indexing +~~~~~~~~~~~~~~~~ + +.. _indexing.boolean: + +Another common operation is the use of boolean vectors to filter the data. + +Using a boolean vector to index a Series works exactly as in a numpy ndarray: + +.. ipython:: python + + s[s > 0] + s[(s < 0) & (s > -0.5)] + +You may select rows from a DataFrame using a boolean vector the same length as +the DataFrame's index (for example, something derived from one of the columns +of the DataFrame): + +.. ipython:: python + + df[df['A'] > 0] + +Consider the ``isin`` method of Series, which returns a boolean vector that is +true wherever the Series elements exist in the passed list. This allows you to +select rows where one or more columns have values you want: + +.. ipython:: python + + df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : randn(7)}) + df2[df2['a'].isin(['one', 'two'])] + +List comprehensions and ``map`` method of Series can also be used to produce +more complex criteria: + +.. ipython:: python + + # only want 'two' or 'three' + criterion = df2['a'].map(lambda x: x.startswith('t')) + + df2[criterion] + + # equivalent but slower + df2[[x.startswith('t') for x in df2['a']]] + + # Multiple criteria + df2[criterion & (df2['b'] == 'x')] + + +Note, with the :ref:`advanced indexing ` ``ix`` method, you +may select along more than one axis using boolean vectors combined with other +indexing expressions. 
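+
+As a rough sketch of what such a combined selection might look like (this
+assumes the ``df2`` frame and ``criterion`` vector constructed just above;
+the particular column list is arbitrary):
+
+.. code-block:: python
+
+   # boolean vector along the rows, list of labels along the columns
+   df2.ix[criterion & (df2['b'] == 'x'), ['b', 'c']]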
+ +Indexing a DataFrame with a boolean DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may wish to set values on a DataFrame based on some boolean criteria +derived from itself or another DataFrame or set of DataFrames. This can be done +intuitively like so: + +.. ipython:: python + + df2 = df.copy() + df2 < 0 + df2[df2 < 0] = 0 + df2 + +Note that such an operation requires that the boolean DataFrame is indexed +exactly the same. + + +Take Methods +~~~~~~~~~~~~ + +.. _indexing.take: + +Similar to numpy ndarrays, pandas Index, Series, and DataFrame also provides +the ``take`` method that retrieves elements along a given axis at the given +indices. The given indices must be either a list or an ndarray of integer +index positions. + +.. ipython:: python + + index = Index(randint(0, 1000, 10)) + index + + positions = [0, 9, 3] + + index[positions] + index.take(positions) + + ser = Series(randn(10)) + + ser.ix[positions] + ser.take(positions) + +For DataFrames, the given indices should be a 1d list or ndarray that specifies +row or column positions. + +.. ipython:: python + + frm = DataFrame(randn(5, 3)) + + frm.take([1, 4, 3]) + + frm.take([0, 2], axis=1) + +It is important to note that the ``take`` method on pandas objects are not +intended to work on boolean indices and may return unexpected results. + +.. ipython:: python + + arr = randn(10) + arr.take([False, False, True, True]) + arr[[0, 1]] + + ser = Series(randn(10)) + ser.take([False, False, True, True]) + ser.ix[[0, 1]] + +Finally, as a small note on performance, because the ``take`` method handles +a narrower range of inputs, it can offer performance that is a good deal +faster than fancy indexing. + +.. ipython:: + + arr = randn(10000, 5) + indexer = np.arange(10000) + random.shuffle(indexer) + + timeit arr[indexer] + timeit arr.take(indexer, axis=0) + + ser = Series(arr[:, 0]) + timeit ser.ix[indexer] + timeit ser.take(indexer) + +Duplicate Data +~~~~~~~~~~~~~~ + +.. _indexing.duplicate: + +If you want to identify and remove duplicate rows in a DataFrame, there are +two methods that will help: ``duplicated`` and ``drop_duplicates``. Each +takes as an argument the columns to use to identify duplicated rows. + +``duplicated`` returns a boolean vector whose length is the number of rows, and +which indicates whether a row is duplicated. + +``drop_duplicates`` removes duplicate rows. + +By default, the first observed row of a duplicate set is considered unique, but +each method has a ``take_last`` parameter that indicates the last observed row +should be taken instead. + +.. ipython:: python + + df2 = DataFrame({'a' : ['one', 'one', 'two', 'three', 'two', 'one', 'six'], + 'b' : ['x', 'y', 'y', 'x', 'y', 'x', 'x'], + 'c' : np.random.randn(7)}) + df2.duplicated(['a','b']) + df2.drop_duplicates(['a','b']) + df2.drop_duplicates(['a','b'], take_last=True) + +.. _indexing.dictionarylike: + +Dictionary-like ``get`` method +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Each of Series, DataFrame, and Panel have a ``get`` method which can return a +default value. + +.. ipython:: python + + s = Series([1,2,3], index=['a','b','c']) + s.get('a') # equivalent to s['a'] + s.get('x', default=-1) + +.. _indexing.advanced: + +Advanced indexing with labels +----------------------------- + +We have avoided excessively overloading the ``[]`` / ``__getitem__`` operator +to keep the basic functionality of the pandas objects straightforward and +simple. 
However, there are often times when you may wish get a subset (or +analogously set a subset) of the data in a way that is not straightforward +using the combination of ``reindex`` and ``[]``. Complicated setting operations +are actually quite difficult because ``reindex`` usually returns a copy. + +By *advanced* indexing we are referring to a special ``.ix`` attribute on +pandas objects which enable you to do getting/setting operations on a +DataFrame, for example, with matrix/ndarray-like semantics. Thus you can +combine the following kinds of indexing: + + - An integer or single label, e.g. ``5`` or ``'a'`` + - A list or array of labels ``['a', 'b', 'c']`` or integers ``[4, 3, 0]`` + - A slice object with ints ``1:7`` or labels ``'a':'f'`` + - A boolean array + +We'll illustrate all of these methods. First, note that this provides a concise +way of reindexing on multiple axes at once: + +.. ipython:: python + + subindex = dates[[3,4,5]] + df.reindex(index=subindex, columns=['C', 'B']) + df.ix[subindex, ['C', 'B']] + +Assignment / setting values is possible when using ``ix``: + +.. ipython:: python + + df2 = df.copy() + df2.ix[subindex, ['C', 'B']] = 0 + df2 + +Indexing with an array of integers can also be done: + +.. ipython:: python + + df.ix[[4,3,1]] + df.ix[dates[[4,3,1]]] + +**Slicing** has standard Python semantics for integer slices: + +.. ipython:: python + + df.ix[1:7, :2] + +Slicing with labels is semantically slightly different because the slice start +and stop are **inclusive** in the label-based case: + +.. ipython:: python + + date1, date2 = dates[[2, 4]] + print date1, date2 + df.ix[date1:date2] + df['A'].ix[date1:date2] + +Getting and setting rows in a DataFrame, especially by their location, is much +easier: + +.. ipython:: python + + df2 = df[:5].copy() + df2.ix[3] + df2.ix[3] = np.arange(len(df2.columns)) + df2 + +Column or row selection can be combined as you would expect with arrays of +labels or even boolean vectors: + +.. ipython:: python + + df.ix[df['A'] > 0, 'B'] + df.ix[date1:date2, 'B'] + df.ix[date1, 'B'] + +Slicing with labels is closely related to the ``truncate`` method which does +precisely ``.ix[start:stop]`` but returns a copy (for legacy reasons). + +Returning a view versus a copy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The rules about when a view on the data is returned are entirely dependent on +NumPy. Whenever an array of labels or a boolean vector are involved in the +indexing operation, the result will be a copy. With single label / scalar +indexing and slicing, e.g. ``df.ix[3:6]`` or ``df.ix[:, 'A']``, a view will be +returned. + +The ``select`` method +~~~~~~~~~~~~~~~~~~~~~ + +Another way to extract slices from an object is with the ``select`` method of +Series, DataFrame, and Panel. This method should be used only when there is no +more direct way. ``select`` takes a function which operates on labels along +``axis`` and returns a boolean. For instance: + +.. ipython:: python + + df.select(lambda x: x == 'A', axis=1) + +The ``lookup`` method +~~~~~~~~~~~~~~~~~~~~~ + +Sometimes you want to extract a set of values given a sequence of row labels +and column labels, and the ``lookup`` method allows for this and returns a +numpy array. For instance, + +.. ipython:: python + + dflookup = DataFrame(np.random.rand(20,4), columns = ['A','B','C','D']) + dflookup.lookup(xrange(0,10,2), ['B','C','A','B','D']) + + +Advanced indexing with integer labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Label-based indexing with integer axis labels is a thorny topic. 
It has been +discussed heavily on mailing lists and among various members of the scientific +Python community. In pandas, our general viewpoint is that labels matter more +than integer locations. Therefore, advanced indexing with ``.ix`` will always +attempt label-based indexing, before falling back on integer-based indexing. + +Setting values in mixed-type DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.mixed_type_setting: + +Setting values on a mixed-type DataFrame or Panel is supported when using scalar +values, though setting arbitrary vectors is not yet supported: + +.. ipython:: python + + df2 = df[:4] + df2['foo'] = 'bar' + print df2 + df2.ix[2] = np.nan + print df2 + print df2.dtypes + +.. _indexing.class: + +Index objects +------------- + +The pandas Index class and its subclasses can be viewed as implementing an +*ordered set* in addition to providing the support infrastructure necessary for +lookups, data alignment, and reindexing. The easiest way to create one directly +is to pass a list or other sequence to ``Index``: + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b']) + index + 'd' in index + +You can also pass a ``name`` to be stored in the index: + + +.. ipython:: python + + index = Index(['e', 'd', 'a', 'b'], name='something') + index.name + +Starting with pandas 0.5, the name, if set, will be shown in the console +display: + +.. ipython:: python + + index = Index(range(5), name='rows') + columns = Index(['A', 'B', 'C'], name='cols') + df = DataFrame(np.random.randn(5, 3), index=index, columns=columns) + df + df['A'] + + +Set operations on Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_ops: + +The three main operations are ``union (|)``, ``intersection (&)``, and ``diff +(-)``. These can be directly called as instance methods or used via overloaded +operators: + +.. ipython:: python + + a = Index(['c', 'b', 'a']) + b = Index(['c', 'e', 'd']) + a.union(b) + a | b + a & b + a - b + +``isin`` method of Index objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One additional operation is the ``isin`` method that works analogously to the +``Series.isin`` method found :ref:`here `. + +.. _indexing.hierarchical: + +Hierarchical indexing (MultiIndex) +---------------------------------- + +Hierarchical indexing (also referred to as "multi-level" indexing) is brand new +in the pandas 0.4 release. It is very exciting as it opens the door to some +quite sophisticated data analysis and manipulation, especially for working with +higher dimensional data. In essence, it enables you to store and manipulate +data with an arbitrary number of dimensions in lower dimensional data +structures like Series (1d) and DataFrame (2d). + +In this section, we will show what exactly we mean by "hierarchical" indexing +and how it integrates with the all of the pandas indexing functionality +described above and in prior sections. Later, when discussing :ref:`group by +` and :ref:`pivoting and reshaping data `, we'll show +non-trivial applications to illustrate how it aids in structuring data for +analysis. + +.. note:: + + Given that hierarchical indexing is so new to the library, it is definitely + "bleeding-edge" functionality but is certainly suitable for production. But, + there may inevitably be some minor API changes as more use cases are explored + and any weaknesses in the design / implementation are identified. pandas aims + to be "eminently usable" so any feedback about new functionality like this is + extremely helpful. 
+ +Creating a MultiIndex (hierarchical index) object +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``MultiIndex`` object is the hierarchical analogue of the standard +``Index`` object which typically stores the axis labels in pandas objects. You +can think of ``MultiIndex`` an array of tuples where each tuple is unique. A +``MultiIndex`` can be created from a list of arrays (using +``MultiIndex.from_arrays``) or an array of tuples (using +``MultiIndex.from_tuples``). + +.. ipython:: python + + arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + tuples + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + s = Series(randn(8), index=index) + s + +As a convenience, you can pass a list of arrays directly into Series or +DataFrame to construct a MultiIndex automatically: + +.. ipython:: python + + arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), + np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + s = Series(randn(8), index=arrays) + s + df = DataFrame(randn(8, 4), index=arrays) + df + +All of the ``MultiIndex`` constructors accept a ``names`` argument which stores +string names for the levels themselves. If no names are provided, some +arbitrary ones will be assigned: + +.. ipython:: python + + index.names + +This index can back any axis of a pandas object, and the number of **levels** +of the index is up to you: + +.. ipython:: python + + df = DataFrame(randn(3, 8), index=['A', 'B', 'C'], columns=index) + df + DataFrame(randn(6, 6), index=index[:6], columns=index[:6]) + +We've "sparsified" the higher levels of the indexes to make the console output a +bit easier on the eyes. + +It's worth keeping in mind that there's nothing preventing you from using tuples +as atomic labels on an axis: + +.. ipython:: python + + Series(randn(8), index=tuples) + +The reason that the ``MultiIndex`` matters is that it can allow you to do +grouping, selection, and reshaping operations as we will describe below and in +subsequent areas of the documentation. As you will see in later sections, you +can find yourself working with hierarchically-indexed data without creating a +``MultiIndex`` explicitly yourself. However, when loading data from a file, you +may wish to generate your own ``MultiIndex`` when preparing the data set. + + +Reconstructing the level labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.get_level_values: + +The method ``get_level_values`` will return a vector of the labels for each +location at a particular level: + +.. ipython:: python + + index.get_level_values(0) + index.get_level_values('second') + + +Basic indexing on axis with MultiIndex +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the important features of hierarchical indexing is that you can select +data by a "partial" label identifying a subgroup in the data. **Partial** +selection "drops" levels of the hierarchical index in the result in a completely +analogous way to selecting a column in a regular DataFrame: + +.. ipython:: python + + df['bar'] + df['bar', 'one'] + df['bar']['one'] + s['qux'] + +Data alignment and using ``reindex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Operations between differently-indexed objects having ``MultiIndex`` on the +axes will work as you expect; data alignment will work the same as an Index of +tuples: + +.. 
ipython:: python + + s + s[:-2] + s + s[::2] + +``reindex`` can be called with another ``MultiIndex`` or even a list or array +of tuples: + +.. ipython:: python + + s.reindex(index[:3]) + s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + +Advanced indexing with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Syntactically integrating ``MultiIndex`` in advanced indexing with ``.ix`` is a +bit challenging, but we've made every effort to do so. for example the +following works as you would expect: + +.. ipython:: python + + df = df.T + df + df.ix['bar'] + df.ix['bar', 'two'] + +"Partial" slicing also works quite nicely: + +.. ipython:: python + + df.ix['baz':'foo'] + df.ix[('baz', 'two'):('qux', 'one')] + df.ix[('baz', 'two'):'foo'] + +Passing a list of labels or tuples works similar to reindexing: + +.. ipython:: python + + df.ix[[('bar', 'two'), ('qux', 'one')]] + +The following does not work, and it's not clear if it should or not: + +:: + + >>> df.ix[['bar', 'qux']] + +The code for implementing ``.ix`` makes every attempt to "do the right thing" +but as you use it you may uncover corner cases or unintuitive behavior. If you +do find something like this, do not hesitate to report the issue or ask on the +mailing list. + +.. _indexing.xs: + +Cross-section with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``xs`` method of ``DataFrame`` additionally takes a level argument to make +selecting data at a particular level of a MultiIndex easier. + +.. ipython:: python + + df.xs('one', level='second') + +.. _indexing.advanced_reindex: + +Advanced reindexing and alignment with hierarchical index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The parameter ``level`` has been added to the ``reindex`` and ``align`` methods +of pandas objects. This is useful to broadcast values across a level. For +instance: + +.. ipython:: python + + midx = MultiIndex(levels=[['zero', 'one'], ['x','y']], + labels=[[1,1,0,0],[1,0,1,0]]) + df = DataFrame(randn(4,2), index=midx) + print df + df2 = df.mean(level=0) + print df2 + print df2.reindex(df.index, level=0) + df_aligned, df2_aligned = df.align(df2, level=0) + print df_aligned + print df2_aligned + + +The need for sortedness +~~~~~~~~~~~~~~~~~~~~~~~ + +**Caveat emptor**: the present implementation of ``MultiIndex`` requires that +the labels be sorted for some of the slicing / indexing routines to work +correctly. You can think about breaking the axis into unique groups, where at +the hierarchical level of interest, each distinct group shares a label, but no +two have the same label. However, the ``MultiIndex`` does not enforce this: +**you are responsible for ensuring that things are properly sorted**. There is +an important new method ``sortlevel`` to sort an axis within a ``MultiIndex`` +so that its labels are grouped and sorted by the original ordering of the +associated factor at that level. Note that this does not necessarily mean the +labels will be sorted lexicographically! + +.. ipython:: python + + import random; random.shuffle(tuples) + s = Series(randn(8), index=MultiIndex.from_tuples(tuples)) + s + s.sortlevel(0) + s.sortlevel(1) + +.. _indexing.sortlevel_byname: + +Note, you may also pass a level name to ``sortlevel`` if the MultiIndex levels +are named. + +.. 
ipython:: python + + s.index.names = ['L1', 'L2'] + s.sortlevel(level='L1') + s.sortlevel(level='L2') + +Some indexing will work even if the data are not sorted, but will be rather +inefficient and will also return a copy of the data rather than a view: + +.. ipython:: python + + s['qux'] + s.sortlevel(1)['qux'] + +On higher dimensional objects, you can sort any of the other axes by level if +they have a MultiIndex: + +.. ipython:: python + + df.T.sortlevel(1, axis=1) + +The ``MultiIndex`` object has code to **explicity check the sort depth**. Thus, +if you try to index at a depth at which the index is not sorted, it will raise +an exception. Here is a concrete example to illustrate this: + +.. ipython:: python + + tuples = [('a', 'a'), ('a', 'b'), ('b', 'a'), ('b', 'b')] + idx = MultiIndex.from_tuples(tuples) + idx.lexsort_depth + + reordered = idx[[1, 0, 3, 2]] + reordered.lexsort_depth + + s = Series(randn(4), index=reordered) + s.ix['a':'a'] + +However: + +:: + + >>> s.ix[('a', 'b'):('b', 'a')] + Exception: MultiIndex lexsort depth 1, key was length 2 + +Swapping levels with ``swaplevel`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``swaplevel`` function can switch the order of two levels: + +.. ipython:: python + + df[:5] + df[:5].swaplevel(0, 1, axis=0) + +.. _indexing.reorderlevels: + +Reordering levels with ``reorder_levels`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``reorder_levels`` function generalizes the ``swaplevel`` function, +allowing you to permute the hierarchical index levels in one step: + +.. ipython:: python + + df[:5].reorder_levels([1,0], axis=0) + + +Some gory internal details +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Internally, the ``MultiIndex`` consists of a few things: the **levels**, the +integer **labels**, and the level **names**: + +.. ipython:: python + + index + index.levels + index.labels + index.names + +You can probably guess that the labels determine which unique element is +identified with that location at each layer of the index. It's important to +note that sortedness is determined **solely** from the integer labels and does +not check (or care) whether the levels themselves are sorted. Fortunately, the +constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but +if you compute the levels and labels yourself, please be careful. + +Adding an index to an existing DataFrame +---------------------------------------- + +Occasionally you will load or create a data set into a DataFrame and want to +add an index after you've already done so. There are a couple of different +ways. + +Add an index using DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _indexing.set_index: + +DataFrame has a ``set_index`` method which takes a column name (for a regular +``Index``) or a list of column names (for a ``MultiIndex``), to create a new, +indexed DataFrame: + +.. ipython:: python + :suppress: + + data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'two'], + 'c' : ['z', 'y', 'x', 'w'], + 'd' : [1., 2., 3, 4]}) + +.. ipython:: python + + data + indexed1 = data.set_index('c') + indexed1 + indexed2 = data.set_index(['a', 'b']) + indexed2 + +Other options in ``set_index`` allow you not drop the index columns or to add +the index in-place (without creating a new object): + +.. 
ipython:: python + + data.set_index('c', drop=False) + df = data.set_index(['a', 'b'], inplace=True) + data + +Remove / reset the index, ``reset_index`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As a convenience, there is a new function on DataFrame called ``reset_index`` +which transfers the index values into the DataFrame's columns and sets a simple +integer index. This is the inverse operation to ``set_index`` + +.. ipython:: python + + df + df.reset_index() + +The output is more similar to a SQL table or a record array. The names for the +columns derived from the index are the ones stored in the ``names`` attribute. + +``reset_index`` takes an optional parameter ``drop`` which if true simply +discards the index, instead of putting index values in the DataFrame's columns. + +.. note:: + + The ``reset_index`` method used to be called ``delevel`` which is now deprecated. + +Adding an ad hoc index +~~~~~~~~~~~~~~~~~~~~~~ + +If you create an index yourself, you can just assign it to the ``index`` field: + +.. code-block:: python + + df.index = index + +Indexing internal details +------------------------- + +.. note:: + + The following is largely relevant for those actually working on the pandas + codebase. And the source code is still the best place to look at the + specifics of how things are implemented. + +In pandas there are a few objects implemented which can serve as valid +containers for the axis labels: + + - ``Index``: the generic "ordered set" object, an ndarray of object dtype + assuming nothing about its contents. The labels must be hashable (and + likely immutable) and unique. Populates a dict of label to location in + Cython to do :math:`O(1)` lookups. + - ``Int64Index``: a version of ``Index`` highly optimized for 64-bit integer + data, such as time stamps + - ``MultiIndex``: the standard hierarchical index object + - ``date_range``: fixed frequency date range generated from a time rule or + DateOffset. An ndarray of Python datetime objects + +The motivation for having an ``Index`` class in the first place was to enable +different implementations of indexing. This means that it's possible for you, +the user, to implement a custom ``Index`` subclass that may be better suited to +a particular application than the ones provided in pandas. For example, we plan +to add a more efficient datetime index which leverages the new +``numpy.datetime64`` dtype in the relatively near future. + +From an internal implementation point of view, the relevant methods that an +``Index`` must define are one or more of the following (depending on how +incompatible the new object internals are with the ``Index`` functions): + + - ``get_loc``: returns an "indexer" (an integer, or in some cases a + slice object) for a label + - ``slice_locs``: returns the "range" to slice between two labels + - ``get_indexer``: Computes the indexing vector for reindexing / data + alignment purposes. See the source / docstrings for more on this + - ``reindex``: Does any pre-conversion of the input index then calls + ``get_indexer`` + - ``union``, ``intersection``: computes the union or intersection of two + Index objects + - ``insert``: Inserts a new label into an Index, yielding a new object + - ``delete``: Delete a label, yielding a new object + - ``drop``: Deletes a set of labels + - ``take``: Analogous to ndarray.take diff --git a/doc/source/install.rst b/doc/source/install.rst new file mode 100644 index 00000000..67546cad --- /dev/null +++ b/doc/source/install.rst @@ -0,0 +1,152 @@ +.. _install: + +.. 
currentmodule:: pandas + +************ +Installation +************ + +You have the option to install an `official release +`__ or to build the `development version +`__. If you choose to install from source and +are running Windows, you will have to ensure that you have a compatible C +compiler (MinGW or Visual Studio) installed. `How-to install MinGW on Windows +`__ + +Python version support +~~~~~~~~~~~~~~~~~~~~~~ + +Officially Python 2.5 to 2.7 and Python 3.1+, although Python 3 support is less +well tested. Python 2.4 support is being phased out since the userbase has +shrunk significantly. Continuing Python 2.4 support will require either monetary +development support or someone contributing to the project to maintain +compatibility. + + +Binary installers +~~~~~~~~~~~~~~~~~ + +.. _all-platforms: + +All platforms +_____________ + +Stable installers available on `PyPI `__ + +Preliminary builds and installers on the `Pandas download page `__ . + +Overview +___________ + + + +.. csv-table:: + :header: "Platform", "Distribution", "Status", "Download / Repository Link", "Install method" + :widths: 10, 10, 10, 20, 50 + + + Windows, all, stable, :ref:`all-platforms`, ``pip install pandas`` + Mac, all, stable, :ref:`all-platforms`, ``pip install pandas`` + Linux, Debian, stable, `official Debian repository `_ , ``sudo apt-get install python-pandas`` + Linux, Debian, unstable (latest packages), `NeuroDebian `_ , ``sudo apt-get install python-pandas`` + Linux, Ubuntu, stable, `official Ubuntu repository `_ , ``sudo apt-get install python-pandas`` + Linux, Ubuntu, unstable (daily builds), `PythonXY PPA `_; activate by: ``sudo add-apt-repository ppa:pythonxy/pythonxy-devel && sudo apt-get update``, ``sudo apt-get install python-pandas`` + Linux, OpenSuse & Fedora, stable, `OpenSuse Repository `_ , ``zypper in python-pandas`` + + + + + + + + + + +Dependencies +~~~~~~~~~~~~ + + * `NumPy `__: 1.6.1 or higher + * `python-dateutil `__ 1.5 + +Optional dependencies +~~~~~~~~~~~~~~~~~~~~~ + + * `SciPy `__: miscellaneous statistical functions + * `PyTables `__: necessary for HDF5-based storage + * `matplotlib `__: for plotting + * `scikits.statsmodels `__ + * Needed for parts of :mod:`pandas.stats` + * `pytz `__ + * Needed for time zone support with ``date_range`` + +.. note:: + + Without the optional dependencies, many useful features will not + work. Hence, it is highly recommended that you install these. A packaged + distribution like the `Enthought Python Distribution + `__ may be worth considering. + +Installing from source +~~~~~~~~~~~~~~~~~~~~~~ +.. note:: + + Installing from the git repository requires a recent installation of `Cython + `__ as the cythonized C sources are no longer checked + into source control. Released source distributions will contain the built C + files. I recommend installing the latest Cython via ``easy_install -U + Cython`` + +The source code is hosted at http://github.com/pydata/pandas, it can be checked +out using git and compiled / installed like so: + +:: + + git clone git://github.com/pydata/pandas.git + cd pandas + python setup.py install + +On Windows, I suggest installing the MinGW compiler suite following the +directions linked to above. 
Once configured property, run the following on the +command line: + +:: + + python setup.py build --compiler=mingw32 + python setup.py install + +Note that you will not be able to import pandas if you open an interpreter in +the source directory unless you build the C extensions in place: + +:: + + python setup.py build_ext --inplace + + +Running the test suite +~~~~~~~~~~~~~~~~~~~~~~ + +pandas is equipped with an exhaustive set of unit tests covering about 97% of +the codebase as of this writing. To run it on your machine to verify that +everything is working (and you have all of the dependencies, soft and hard, +installed), make sure you have `nose +`__ and run: + +:: + + $ nosetests pandas + .......................................................................... + .......................S.................................................. + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .......................................................................... + .................S........................................................ + .... + ---------------------------------------------------------------------- + Ran 818 tests in 21.631s + + OK (SKIP=2) diff --git a/doc/source/io.rst b/doc/source/io.rst new file mode 100644 index 00000000..8c67b7d9 --- /dev/null +++ b/doc/source/io.rst @@ -0,0 +1,783 @@ +.. _io: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import os + import csv + from StringIO import StringIO + + import numpy as np + np.random.seed(123456) + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + + import matplotlib.pyplot as plt + plt.close('all') + + from pandas import * + import pandas.util.testing as tm + clipdf = DataFrame({'A':[1,2,3],'B':[4,5,6],'C':['p','q','r']}, + index=['x','y','z']) + +******************************* +IO Tools (Text, CSV, HDF5, ...) +******************************* + +Clipboard +--------- + +.. _io.clipboard: + +A handy way to grab data is to use the ``read_clipboard`` method, which takes +the contents of the clipboard buffer and passes them to the ``read_table`` +method described in the next section. For instance, you can copy the following +text to the clipboard (CTRL-C on many operating systems): + +.. code-block:: python + + A B C + x 1 4 p + y 2 5 q + z 3 6 r + +And then import the data directly to a DataFrame by calling: + +.. code-block:: python + + clipdf = read_clipboard(sep='\s*') + +.. ipython:: python + + clipdf + +.. _io.read_csv_table: + +CSV & Text files +---------------- + +The two workhorse functions for reading text files (a.k.a. flat files) are +:func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table`. +They both use the same parsing code to intelligently convert tabular +data into a DataFrame object. They can take a number of arguments: + + - ``filepath_or_buffer``: Either a string path to a file, or any object with a + ``read`` method (such as an open file or ``StringIO``). + - ``sep`` or ``delimiter``: A delimiter / separator to split fields + on. 
`read_csv` is capable of inferring the delimiter automatically in some
+    cases by "sniffing." The separator may be specified as a regular
+    expression; for instance, you may use '\s*' to indicate arbitrary
+    whitespace.
+  - ``dialect``: string or csv.Dialect instance to expose more ways to specify
+    the file format
+  - ``header``: row number to use as the column names, and the start of the
+    data. Defaults to 0 (first row); specify None if there is no header row.
+  - ``skiprows``: A collection of numbers for rows in the file to skip. Can
+    also be an integer to skip the first ``n`` rows
+  - ``index_col``: column number, column name, or list of column numbers/names,
+    to use as the ``index`` (row labels) of the resulting DataFrame. By default,
+    it will number the rows without using any column, unless there is one more
+    data column than there are headers, in which case the first column is taken
+    as the index.
+  - ``names``: List of column names to use. If passed, header will be
+    implicitly set to None.
+  - ``na_values``: optional list of strings to recognize as NaN (missing
+    values), in addition to a default set. If you pass an empty list, or an
+    empty list for a particular column, no values (including empty strings)
+    will be considered NA
+  - ``parse_dates``: if True then the index will be parsed as dates
+    (False by default). You can specify more complicated options to parse
+    a subset of columns or a combination of columns into a single date column
+    (list of ints or names, list of lists, or dict):
+      [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column
+      [[1, 3]] -> combine columns 1 and 3 and parse as a single date column
+      {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo'
+  - ``keep_date_col``: if True, then date component columns passed into
+    ``parse_dates`` will be retained in the output (False by default).
+  - ``date_parser``: function to use to parse strings into datetime
+    objects. If ``parse_dates`` is True, it defaults to the very robust
+    ``dateutil.parser``. Specifying this implicitly sets ``parse_dates`` as
+    True. You can also use functions from the community-supported date
+    converters in date_converters.py
+  - ``dayfirst``: if True then uses the DD/MM international/European date
+    format (this is False by default)
+  - ``thousands``: specifies the thousands separator. If not None, the parser
+    will look for it in the data and parse the relevant values to integers.
+    Because it has to essentially scan through the data again, this causes a
+    significant performance hit so only use if necessary.
+  - ``comment``: denotes the start of a comment and ignores the rest of the
+    line. Currently line commenting is not supported.
+  - ``nrows``: Number of rows to read out of the file. Useful to only read a
+    small portion of a large file
+  - ``iterator``: If True, return a ``TextParser`` to enable reading a file
+    into memory piece by piece
+  - ``chunksize``: The number of rows to be used to "chunk" a file into
+    pieces. Will cause a ``TextParser`` object to be returned.
More on this + below in the section on :ref:`iterating and chunking ` + - ``skip_footer``: number of lines to skip at bottom of file (default 0) + - ``converters``: a dictionary of functions for converting values in certain + columns, where keys are either integers or column labels + - ``encoding``: a string representing the encoding to use if the contents are + non-ascii + - ``verbose``: show number of NA values inserted in non-numeric columns + - ``squeeze``: if True then output with only one column is turned into Series + +.. ipython:: python + :suppress: + + f = open('foo.csv','w') + f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a typical CSV file containing, in this case, some time series data: + +.. ipython:: python + + print open('foo.csv').read() + +The default for `read_csv` is to create a DataFrame with simple numbered rows: + +.. ipython:: python + + read_csv('foo.csv') + +In the case of indexed data, you can pass the column number or column name you +wish to use as the index: + +.. ipython:: python + + read_csv('foo.csv', index_col=0) + +.. ipython:: python + + read_csv('foo.csv', index_col='date') + +You can also use a list of columns to create a hierarchical index: + +.. ipython:: python + + read_csv('foo.csv', index_col=[0, 'A']) + +.. _io.dialect: + +The ``dialect`` keyword gives greater flexibility in specifying the file format. +By default it uses the Excel dialect but you can specify either the dialect name +or a `csv.Dialect `_ instance. + +.. ipython:: python + :suppress: + + data = ('label1,label2,label3\n' + 'index1,"a,c,e\n' + 'index2,b,d,f') + +Suppose you had data with unenclosed quotes: + +.. ipython:: python + + print data + +By default, ``read_csv`` uses the Excel dialect and treats the double quote as +the quote character, which causes it to fail when it finds a newline before it +finds the closing double quote. + +We can get around this using ``dialect`` + +.. ipython:: python + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + read_csv(StringIO(data), dialect=dia) + +The parsers make every attempt to "do the right thing" and not be very +fragile. Type inference is a pretty big deal. So if a column can be coerced to +integer dtype without altering the contents, it will do so. Any non-numeric +columns will come through as object dtype as with the rest of pandas objects. + +.. _io.parse_dates: + +Specifying Date Columns +~~~~~~~~~~~~~~~~~~~~~~~ + +To better facilitate working with datetime data, :func:`~pandas.io.parsers.read_csv` and :func:`~pandas.io.parsers.read_table` +uses the keyword arguments ``parse_dates`` and ``date_parser`` to allow users +to specify a variety of columns and date/time formats to turn the input text +data into ``datetime`` objects. + +The simplest case is to just pass in ``parse_dates=True``: + +.. ipython:: python + + # Use a column as an index, and parse it as dates. + df = read_csv('foo.csv', index_col=0, parse_dates=True) + df + + # These are python datetime objects + df.index + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + +It is often the case that we may want to store date and time data separately, +or store various date fields separately. the ``parse_dates`` keyword can be +used to specify a combination of columns to parse the dates and/or times from. 
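+
+To sketch the accepted forms before the worked examples below (the file name
+and column positions here are made up and not one of the files used elsewhere
+in this section):
+
+.. code-block:: python
+
+    # parse columns 1, 2 and 3 each as a separate date column
+    df = read_csv('data.csv', parse_dates=[1, 2, 3])
+
+    # combine columns 1 and 2 into a single datetime column
+    df = read_csv('data.csv', parse_dates=[[1, 2]])
+
+    # same combination, but give the resulting column an explicit name
+    df = read_csv('data.csv', parse_dates={'timestamp': [1, 2]})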
+ +You can specify a list of column lists to ``parse_dates``, the resulting date +columns will be prepended to the output (so as to not affect the existing column +order) and the new column names will be the concatenation of the component +column names: + +.. ipython:: python + :suppress: + + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print open('tmp.csv').read() + df = read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]]) + df + +By default the parser removes the component date columns, but you can choose +to retain them via the ``keep_date_col`` keyword: + +.. ipython:: python + + df = read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]], + keep_date_col=True) + df + +Note that if you wish to combine multiple columns into a single date column, a +nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that +the second and third columns should each be parsed as separate date columns +while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a +single column. + +You can also use a dict to specify custom name columns: + +.. ipython:: python + + date_spec = {'nominal': [1, 2], 'actual': [1, 3]} + df = read_csv('tmp.csv', header=None, parse_dates=date_spec) + df + +Date Parsing Functions +~~~~~~~~~~~~~~~~~~~~~~ +Finally, the parser allows you can specify a custom ``date_parser`` function to +take full advantage of the flexiblity of the date parsing API: + +.. ipython:: python + + import pandas.io.date_converters as conv + df = read_csv('tmp.csv', header=None, parse_dates=date_spec, + date_parser=conv.parse_date_time) + df + +You can explore the date parsing functionality in ``date_converters.py`` and +add your own. We would love to turn this module into a community supported set +of date/time parsers. To get you started, ``date_converters.py`` contains +functions to parse dual date and time columns, year/month/day columns, +and year/month/day/hour/minute/second columns. It also contains a +``generic_parser`` function so you can curry it with a function that deals with +a single date rather than the entire array. + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.dayfirst: + +International Date Formats +~~~~~~~~~~~~~~~~~~~~~~~~~~ +While US date formats tend to be MM/DD/YYYY, many international formats use +DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: + +.. ipython:: python + :suppress: + + data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print open('tmp.csv').read() + + read_csv('tmp.csv', parse_dates=[0]) + + read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) + +.. _io.thousands: + +Thousand Separators +~~~~~~~~~~~~~~~~~~~ +For large integers that have been written with a thousands separator, you can +set the ``thousands`` keyword to ``True`` so that integers will be parsed +correctly: + +.. ipython:: python + :suppress: + + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +By default, integers with a thousands separator will be parsed as strings + +.. 
ipython:: python + + print open('tmp.csv').read() + df = read_csv('tmp.csv', sep='|') + df + + df.level.dtype + +The ``thousands`` keyword allows integers to be parsed correctly + +.. ipython:: python + + print open('tmp.csv').read() + df = read_csv('tmp.csv', sep='|', thousands=',') + df + + df.level.dtype + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.comments: + +Comments +~~~~~~~~ +Sometimes comments or meta data may be included in a file: + +.. ipython:: python + :suppress: + + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print open('tmp.csv').read() + +By default, the parse includes the comments in the output: + +.. ipython:: python + + df = read_csv('tmp.csv') + df + +We can suppress the comments using the ``comment`` keyword: + +.. ipython:: python + + df = read_csv('tmp.csv', comment='#') + df + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +Returning Series +~~~~~~~~~~~~~~~~ + +Using the ``squeeze`` keyword, the parser will return output with a single column +as a ``Series``: + +.. ipython:: python + :suppress: + + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") + + with open('tmp.csv', 'w') as fh: + fh.write(data) + +.. ipython:: python + + print open('tmp.csv').read() + + output = read_csv('tmp.csv', squeeze=True) + output + + type(output) + +.. ipython:: python + :suppress: + + os.remove('tmp.csv') + +.. _io.fwf: + +Files with Fixed Width Columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +While `read_csv` reads delimited data, the :func:`~pandas.io.parsers.read_fwf` +function works with data files that have known and fixed column widths. +The function parameters to `read_fwf` are largely the same as `read_csv` with +two extra parameters: + + - ``colspecs``: a list of pairs (tuples), giving the extents of the + fixed-width fields of each line as half-open intervals [from, to[ + - ``widths``: a list of field widths, which can be used instead of + ``colspecs`` if the intervals are contiguous + +.. ipython:: python + :suppress: + + f = open('bar.csv', 'w') + data1 = ("id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3") + f.write(data1) + f.close() + +Consider a typical fixed-width data file: + +.. ipython:: python + + print open('bar.csv').read() + +In order to parse this file into a DataFrame, we simply need to supply the +column specifications to the `read_fwf` function along with the file name: + +.. ipython:: python + + #Column specifications are a list of half-intervals + colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] + df = read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) + df + +Note how the parser automatically picks column names X. when +``header=None`` argument is specified. Alternatively, you can supply just the +column widths for contiguous columns: + +.. ipython:: python + + #Widths are a list of integers + widths = [6, 14, 13, 10] + df = read_fwf('bar.csv', widths=widths, header=None) + df + +The parser will take care of extra white spaces around the columns +so it's ok to have extra separation between the columns in the file. + +.. 
ipython:: python + :suppress: + + os.remove('bar.csv') + +Files with an "implicit" index column +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + f = open('foo.csv', 'w') + f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f.close() + +Consider a file with one less entry in the header than the number of data +column: + +.. ipython:: python + + print open('foo.csv').read() + +In this special case, ``read_csv`` assumes that the first column is to be used +as the index of the DataFrame: + +.. ipython:: python + + read_csv('foo.csv') + +Note that the dates weren't automatically parsed. In that case you would need +to do as before: + +.. ipython:: python + + df = read_csv('foo.csv', parse_dates=True) + df.index + +.. ipython:: python + :suppress: + + os.remove('foo.csv') + + +Reading DataFrame objects with ``MultiIndex`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.csv_multiindex: + +Suppose you have data indexed by two columns: + +.. ipython:: python + + print open('data/mindex_ex.csv').read() + +The ``index_col`` argument to ``read_csv`` and ``read_table`` can take a list of +column numbers to turn multiple columns into a ``MultiIndex``: + +.. ipython:: python + + df = read_csv("data/mindex_ex.csv", index_col=[0,1]) + df + df.ix[1978] + +.. _io.sniff: + +Automatically "sniffing" the delimiter +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``read_csv`` is capable of inferring delimited (not necessarily +comma-separated) files. YMMV, as pandas uses the Sniffer_ class of the csv +module. + +.. ipython:: python + :suppress: + + df = DataFrame(np.random.randn(10, 4)) + df.to_csv('tmp.sv', sep='|') + df.to_csv('tmp2.sv', sep=':') + +.. ipython:: python + + print open('tmp2.sv').read() + read_csv('tmp2.sv') + +.. _Sniffer: http://docs.python.org/library/csv.html#csv.Sniffer + +.. _io.chunking: + +Iterating through files chunk by chunk +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Suppose you wish to iterate through a (potentially very large) file lazily +rather than reading the entire file into memory, such as the following: + + +.. ipython:: python + + print open('tmp.sv').read() + table = read_table('tmp.sv', sep='|') + table + + +By specifiying a ``chunksize`` to ``read_csv`` or ``read_table``, the return +value will be an iterable object of type ``TextParser``: + +.. ipython:: python + + reader = read_table('tmp.sv', sep='|', chunksize=4) + reader + + for chunk in reader: + print chunk + + +Specifying ``iterator=True`` will also return the ``TextParser`` object: + +.. ipython:: python + + reader = read_table('tmp.sv', sep='|', iterator=True) + reader.get_chunk(5) + +.. ipython:: python + :suppress: + + os.remove('tmp.sv') + os.remove('tmp2.sv') + +Writing to CSV format +~~~~~~~~~~~~~~~~~~~~~ + +.. _io.store_in_csv: + +The Series and DataFrame objects have an instance method ``to_csv`` which +allows storing the contents of the object as a comma-separated-values file. The +function takes a number of arguments. Only the first is required. + + - ``path``: A string path to the file to write + ``nanRep``: A string representation of a missing value (default '') + - ``cols``: Columns to write (default None) + - ``header``: Whether to write out the column names (default True) + - ``index``: whether to write row (index) names (default True) + - ``index_label``: Column label(s) for index column(s) if desired. If None + (default), and `header` and `index` are True, then the index names are + used. 
(A sequence should be given if the DataFrame uses MultiIndex). + - ``mode`` : Python write mode, default 'w' + - ``sep`` : Field delimiter for the output file (default "'") + - ``encoding``: a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + +Writing a formatted string +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.formatting: + +The DataFrame object has an instance method ``to_string`` which allows control +over the string representation of the object. All arguments are optional: + + - ``buf`` default None, for example a StringIO object + - ``columns`` default None, which columns to write + - ``col_space`` default None, number of spaces to write between columns + - ``na_rep`` default ``NaN``, representation of NA value + - ``formatters`` default None, a dictionary (by column) of functions each of + which takes a single argument and returns a formatted string + - ``float_format`` default None, a function which takes a single (float) + argument and returns a formatted string; to be applied to floats in the + DataFrame. + - ``sparsify`` default True, set to False for a DataFrame with a hierarchical + index to print every multiindex key at each row. + - ``index_names`` default True, will print the names of the indices + - ``index`` default True, will print the index (ie, row labels) + - ``header`` default True, will print the column labels + - ``justify`` default ``left``, will print column headers left- or + right-justified + +The Series object also has a ``to_string`` method, but with only the ``buf``, +``na_rep``, ``float_format`` arguments. There is also a ``length`` argument +which, if set to ``True``, will additionally output the length of the Series. + + +Writing to HTML format +~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.html: + +DataFrame object has an instance method ``to_html`` which renders the contents +of the DataFrame as an html table. The function arguments are as in the method +``to_string`` described above. + + +Excel files +---------------- + +The ``ExcelFile`` class can read an Excel 2003 file using the ``xlrd`` Python +module and use the same parsing code as the above to convert tabular data into +a DataFrame. To use it, create the ``ExcelFile`` object: + +.. code-block:: python + + xls = ExcelFile('path_to_file.xls') + +Then use the ``parse`` instance method with a sheetname, then use the same +additional arguments as the parsers above: + +.. code-block:: python + + xls.parse('Sheet1', index_col=None, na_values=['NA']) + +To read sheets from an Excel 2007 file, you can pass a filename with a ``.xlsx`` +extension, in which case the ``openpyxl`` module will be used to read the file. + +To write a DataFrame object to a sheet of an Excel file, you can use the +``to_excel`` instance method. The arguments are largely the same as ``to_csv`` +described above, the first argument being the name of the excel file, and the +optional second argument the name of the sheet to which the DataFrame should be +written. For example: + +.. code-block:: python + + df.to_excel('path_to_file.xlsx', sheet_name='sheet1') + +Files with a ``.xls`` extension will be written using ``xlwt`` and those with +a ``.xlsx`` extension will be written using ``openpyxl``. +The Panel class also has a ``to_excel`` instance method, +which writes each DataFrame in the Panel to a separate sheet. + +In order to write separate DataFrames to separate sheets in a single Excel file, +one can use the ExcelWriter class, as in the following example: + +.. 
code-block:: python + + writer = ExcelWriter('path_to_file.xlsx') + df1.to_excel(writer, sheet_name='sheet1') + df2.to_excel(writer, sheet_name='sheet2') + writer.save() + + +HDF5 (PyTables) +--------------- + +``HDFStore`` is a dict-like object which reads and writes pandas to the high +performance HDF5 format using the excellent `PyTables +`__ library. + +.. ipython:: python + :suppress: + :okexcept: + + os.remove('store.h5') + +.. ipython:: python + + store = HDFStore('store.h5') + print store + +Objects can be written to the file just like adding key-value pairs to a dict: + +.. ipython:: python + + index = date_range('1/1/2000', periods=8) + s = Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) + df = DataFrame(randn(8, 3), index=index, + columns=['A', 'B', 'C']) + wp = Panel(randn(2, 5, 4), items=['Item1', 'Item2'], + major_axis=date_range('1/1/2000', periods=5), + minor_axis=['A', 'B', 'C', 'D']) + + store['s'] = s + store['df'] = df + store['wp'] = wp + store + +In a current or later Python session, you can retrieve stored objects: + +.. ipython:: python + + store['df'] + +.. ipython:: python + :suppress: + + store.close() + import os + os.remove('store.h5') + + +.. Storing in Table format +.. ~~~~~~~~~~~~~~~~~~~~~~~ + +.. Querying objects stored in Table format +.. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/merging.rst b/doc/source/merging.rst new file mode 100644 index 00000000..a8ec112f --- /dev/null +++ b/doc/source/merging.rst @@ -0,0 +1,613 @@ +.. currentmodule:: pandas +.. _merging: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from numpy import nan + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + +**************************** +Merge, join, and concatenate +**************************** + +pandas provides various facilities for easily combining together Series, +DataFrame, and Panel objects with various kinds of set logic for the indexes +and relational algebra functionality in the case of join / merge-type +operations. + +.. _merging.concat: + +Concatenating objects +--------------------- + +The ``concat`` function (in the main pandas namespace) does all of the heavy +lifting of performing concatenation operations along an axis while performing +optional set logic (union or intersection) of the indexes (if any) on the other +axes. Note that I say "if any" because there is only a single possible axis of +concatenation for Series. + +Before diving into all of the details of ``concat`` and what it can do, here is +a simple example: + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df[:3], df[3:7], df[7:]] + + concatenated = concat(pieces) + concatenated + +Like its sibling function on ndarrays, ``numpy.concatenate``, ``pandas.concat`` +takes a list or dict of homogeneously-typed objects and concatenates them with +some configurable handling of "what to do with the other axes": + +:: + + concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, + keys=None, levels=None, names=None, verify_integrity=False) + +- ``objs``: list or dict of Series, DataFrame, or Panel objects. If a dict is + passed, the sorted keys will be used as the `keys` argument, unless it is + passed, in which case the values will be selected (see below) +- ``axis``: {0, 1, ...}, default 0. The axis to concatenate along +- ``join``: {'inner', 'outer'}, default 'outer'. How to handle indexes on + other axis(es). 
Outer for union and inner for intersection +- ``join_axes``: list of Index objects. Specific indexes to use for the other + n - 1 axes instead of performing inner/outer set logic +- ``keys``: sequence, default None. Construct hierarchical index using the + passed keys as the outermost level If multiple levels passed, should + contain tuples. +- ``levels`` : list of sequences, default None. If keys passed, specific + levels to use for the resulting MultiIndex. Otherwise they will be inferred + from the keys +- ``names``: list, default None. Names for the levels in the resulting + hierarchical index +- ``verify_integrity``: boolean, default False. Check whether the new + concatenated axis contains duplicates. This can be very expensive relative + to the actual data concatenation +- ``ignore_index`` : boolean, default False. If True, do not use the index + values on the concatenation axis. The resulting axis will be labeled 0, ..., + n - 1. This is useful if you are concatenating objects where the + concatenation axis does not have meaningful indexing information. + +Without a little bit of context and example many of these arguments don't make +much sense. Let's take the above example. Suppose we wanted to associate +specific keys with each of the pieces of the chopped up DataFrame. We can do +this using the ``keys`` argument: + +.. ipython:: python + + concatenated = concat(pieces, keys=['first', 'second', 'third']) + concatenated + +As you can see (if you've read the rest of the documentation), the resulting +object's index has a :ref:`hierarchical index `. This +means that we can now do stuff like select out each chunk by key: + +.. ipython:: python + + concatenated.ix['second'] + +It's not a stretch to see how this can be very useful. More detail on this +functionality below. + +Set logic on the other axes +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When gluing together multiple DataFrames (or Panels or...), for example, you +have a choice of how to handle the other axes (other than the one being +concatenated). This can be done in three ways: + +- Take the (sorted) union of them all, ``join='outer'``. This is the default + option as it results in zero information loss. +- Take the intersection, ``join='inner'``. +- Use a specific index (in the case of DataFrame) or indexes (in the case of + Panel or future higher dimensional objects), i.e. the ``join_axes`` argument + +Here is a example of each of these methods. First, the default ``join='outer'`` +behavior: + +.. ipython:: python + + from pandas.util.testing import rands + df = DataFrame(np.random.randn(10, 4), columns=['a', 'b', 'c', 'd'], + index=[rands(5) for _ in xrange(10)]) + df + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1) + +Note that the row indexes have been unioned and sorted. Here is the same thing +with ``join='inner'``: + +.. ipython:: python + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1, join='inner') + +Lastly, suppose we just wanted to reuse the *exact index* from the original +DataFrame: + +.. ipython:: python + + concat([df.ix[:7, ['a', 'b']], df.ix[2:-2, ['c']], + df.ix[-7:, ['d']]], axis=1, join_axes=[df.index]) + +.. _merging.concatenation: + +Concatenating using ``append`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A useful shortcut to ``concat`` are the ``append`` instance methods on Series +and DataFrame. These methods actually predated ``concat``. They concatenate +along ``axis=0``, namely the index: + +.. 
ipython:: python + + s = Series(randn(10), index=np.arange(10)) + s1 = s[:5] # note we're slicing with labels here, so 5 is included + s2 = s[6:] + s1.append(s2) + +In the case of DataFrame, the indexes must be disjoint but the columns do not +need to be: + +.. ipython:: python + + df = DataFrame(randn(6, 4), index=date_range('1/1/2000', periods=6), + columns=['A', 'B', 'C', 'D']) + df1 = df.ix[:3] + df2 = df.ix[3:, :3] + df1 + df2 + df1.append(df2) + +``append`` may take multiple objects to concatenate: + +.. ipython:: python + + df1 = df.ix[:2] + df2 = df.ix[2:4] + df3 = df.ix[4:] + df1.append([df2,df3]) + +.. note:: + + Unlike `list.append` method, which appends to the original list and + returns nothing, ``append`` here **does not** modify ``df1`` and + returns its copy with ``df2`` appended. + +.. _merging.ignore_index: + +Ignoring indexes on the concatenation axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For DataFrames which don't have a meaningful index, you may wish to append them +and ignore the fact that they may have overlapping indexes: + +.. ipython:: python + + df1 = DataFrame(randn(6, 4), columns=['A', 'B', 'C', 'D']) + df2 = DataFrame(randn(3, 4), columns=['A', 'B', 'C', 'D']) + + df1 + df2 + +To do this, use the ``ignore_index`` argument: + +.. ipython:: python + + concat([df1, df2], ignore_index=True) + +This is also a valid argument to ``DataFrame.append``: + +.. ipython:: python + + df1.append(df2, ignore_index=True) + + +More concatenating with group keys +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's consider a variation on the first example presented: + +.. ipython:: python + + df = DataFrame(np.random.randn(10, 4)) + df + + # break it into pieces + pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + + result = concat(pieces, axis=1, keys=['one', 'two', 'three']) + result + +You can also pass a dict to ``concat`` in which case the dict keys will be used +for the ``keys`` argument (unless other keys are specified): + +.. ipython:: python + + pieces = {'one': df.ix[:, [0, 1]], + 'two': df.ix[:, [2]], + 'three': df.ix[:, [3]]} + concat(pieces, axis=1) + concat(pieces, keys=['three', 'two']) + +The MultiIndex created has levels that are constructed from the passed keys and +the columns of the DataFrame pieces: + +.. ipython:: python + + result.columns.levels + +If you wish to specify other levels (as will occasionally be the case), you can +do so using the ``levels`` argument: + +.. ipython:: python + + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[['three', 'two', 'one', 'zero']], + names=['group_key']) + result + result.columns.levels + +Yes, this is fairly esoteric, but is actually necessary for implementing things +like GroupBy where the order of a categorical variable is meaningful. + +.. _merging.append.row: + +Appending rows to a DataFrame +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While not especially efficient (since a new object must be created), you can +append a single row to a DataFrame by passing a Series or dict to ``append``, +which returns a new DataFrame as above. + +.. ipython:: python + + df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df + s = df.xs(3) + df.append(s, ignore_index=True) + +You should use ``ignore_index`` with this method to instruct DataFrame to +discard its index. If you wish to preserve the index, you should construct an +appropriately-indexed DataFrame and append or concatenate those objects. + +You can also pass a list of dicts or Series: + +.. 
ipython:: python + + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, + {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + result = df.append(dicts, ignore_index=True) + result + +.. _merging.join: + +Database-style DataFrame joining/merging +---------------------------------------- + +pandas has full-featured, **high performance** in-memory join operations +idiomatically very similar to relational databases like SQL. These methods +perform significantly better (in some cases well over an order of magnitude +better) than other open source implementations (like ``base::merge.data.frame`` +in R). The reason for this is careful algorithmic design and internal layout of +the data in DataFrame. + +pandas provides a single function, ``merge``, as the entry point for all +standard database join operations between DataFrame objects: + +:: + + merge(left, right, how='left', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('.x', '.y'), copy=True) + +Here's a description of what each argument is for: + + - ``left``: A DataFrame object + - ``right``: Another DataFrame object + - ``on``: Columns (names) to join on. Must be found in both the left and + right DataFrame objects. If not passed and ``left_index`` and + ``right_index`` are ``False``, the intersection of the columns in the + DataFrames will be inferred to be the join keys + - ``left_on``: Columns from the left DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame + - ``right_on``: Columns from the right DataFrame to use as keys. Can either be + column names or arrays with length equal to the length of the DataFrame + - ``left_index``: If ``True``, use the index (row labels) from the left + DataFrame as its join key(s). In the case of a DataFrame with a MultiIndex + (hierarchical), the number of levels must match the number of join keys + from the right DataFrame + - ``right_index``: Same usage as ``left_index`` for the right DataFrame + - ``how``: One of ``'left'``, ``'right'``, ``'outer'``, ``'inner'``. Defaults + to ``inner``. See below for more detailed description of each method + - ``sort``: Sort the result DataFrame by the join keys in lexicographical + order. Defaults to ``True``, setting to ``False`` will improve performance + substantially in many cases + - ``suffixes``: A tuple of string suffixes to apply to overlapping + columns. Defaults to ``('.x', '.y')``. + - ``copy``: Always copy data (default ``True``) from the passed DataFrame + objects, even when reindexing is not necessary. Cannot be avoided in many + cases but may improve performance / memory usage. The cases where copying + can be avoided are somewhat pathological but this option is provided + nonetheless. + +``merge`` is a function in the pandas namespace, and it is also available as a +DataFrame instance method, with the calling DataFrame being implicitly +considered the left object in the join. + +The related ``DataFrame.join`` method, uses ``merge`` internally for the +index-on-index and index-on-column(s) joins, but *joins on indexes* by default +rather than trying to join on common columns (the default behavior for +``merge``). If you are joining on index, you may wish to use ``DataFrame.join`` +to save yourself some typing. 
+ +Brief primer on merge methods (relational algebra) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Experienced users of relational databases like SQL will be familiar with the +terminology used to describe join operations between two SQL-table like +structures (DataFrame objects). There are several cases to consider which are +very important to understand: + + - **one-to-one** joins: for example when joining two DataFrame objects on + their indexes (which must contain unique values) + - **many-to-one** joins: for example when joining an index (unique) to one or + more columns in a DataFrame + - **many-to-many** joins: joining columns on columns. + +.. note:: + + When joining columns on columns (potentially a many-to-many join), any + indexes on the passed DataFrame objects **will be discarded**. + + +It is worth spending some time understanding the result of the **many-to-many** +join case. In SQL / standard relational algebra, if a key combination appears +more than once in both tables, the resulting table will have the **Cartesian +product** of the associated data. Here is a very basic example with one unique +key combination: + +.. ipython:: python + + left = DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) + right = DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left + right + merge(left, right, on='key') + +Here is a more complicated example with multiple join keys: + +.. ipython:: python + + left = DataFrame({'key1': ['foo', 'foo', 'bar'], + 'key2': ['one', 'two', 'one'], + 'lval': [1, 2, 3]}) + right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'], + 'key2': ['one', 'one', 'one', 'two'], + 'rval': [4, 5, 6, 7]}) + merge(left, right, how='outer') + merge(left, right, how='inner') + +The ``how`` argument to ``merge`` specifies how to determine which keys are to +be included in the resulting table. If a key combination **does not appear** in +either the left or right tables, the values in the joined table will be +``NA``. Here is a summary of the ``how`` options and their SQL equivalent names: + +.. csv-table:: + :header: "Merge method", "SQL Join Name", "Description" + :widths: 20, 20, 60 + + ``left``, ``LEFT OUTER JOIN``, Use keys from left frame only + ``right``, ``RIGHT OUTER JOIN``, Use keys from right frame only + ``outer``, ``FULL OUTER JOIN``, Use union of keys from both frames + ``inner``, ``INNER JOIN``, Use intersection of keys from both frames + +Note that if using the index from either the left or right DataFrame (or both) +using the ``left_index`` / ``right_index`` options, the join operation is no +longer a many-to-many join by construction, as the index values are necessarily +unique. There will be some examples of this below. + +.. _merging.join.index: + +Joining on index +~~~~~~~~~~~~~~~~ + +``DataFrame.join`` is a convenient method for combining the columns of two +potentially differently-indexed DataFrames into a single result DataFrame. Here +is a very basic example: + +.. ipython:: python + + df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df1 = df.ix[1:, ['A', 'B']] + df2 = df.ix[:5, ['C', 'D']] + df1 + df2 + df1.join(df2) + df1.join(df2, how='outer') + df1.join(df2, how='inner') + +The data alignment here is on the indexes (row labels). This same behavior can +be achieved using ``merge`` plus additional arguments instructing it to use the +indexes: + +.. 
ipython:: python + + merge(df1, df2, left_index=True, right_index=True, how='outer') + +Joining key columns on an index +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``join`` takes an optional ``on`` argument which may be a column or multiple +column names, which specifies that the passed DataFrame is to be aligned on +that column in the DataFrame. These two function calls are completely +equivalent: + +:: + + left.join(right, on=key_or_keys) + merge(left, right, left_on=key_or_keys, right_index=True, + how='left', sort=False) + +Obviously you can choose whichever form you find more convenient. For +many-to-one joins (where one of the DataFrame's is already indexed by the join +key), using ``join`` may be more convenient. Here is a simple example: + +.. ipython:: python + + df['key'] = ['foo', 'bar'] * 4 + to_join = DataFrame(randn(2, 2), index=['bar', 'foo'], + columns=['j1', 'j2']) + df + to_join + df.join(to_join, on='key') + merge(df, to_join, left_on='key', right_index=True, + how='left', sort=False) + +.. _merging.multikey_join: + +To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: + +.. ipython:: python + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + data + to_join + +Now this can be joined by passing the two key column names: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2']) + +.. _merging.df_inner_join: + +The default for ``DataFrame.join`` is to perform a left join (essentially a +"VLOOKUP" operation, for Excel users), which uses only the keys found in the +calling DataFrame. Other join types, for example inner join, can be just as +easily performed: + +.. ipython:: python + + data.join(to_join, on=['key1', 'key2'], how='inner') + +As you can see, this drops any rows where there was no match. + +Overlapping value columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The merge ``suffixes`` argument takes a tuple of list of strings to append to +overlapping column names in the input DataFrames to disambiguate the result +columns: + +.. ipython:: python + + left = DataFrame({'key': ['foo', 'foo'], 'value': [1, 2]}) + right = DataFrame({'key': ['foo', 'foo'], 'value': [4, 5]}) + merge(left, right, on='key', suffixes=['_left', '_right']) + +``DataFrame.join`` has ``lsuffix`` and ``rsuffix`` arguments which behave +similarly. + +.. _merging.ordered_merge: + +Merging Ordered Data +~~~~~~~~~~~~~~~~~~~~ + +New in v0.8.0 is the ordered_merge function for combining time series and other +ordered data. In particular it has an optional ``fill_method`` keyword to +fill/interpolate missing data: + +.. ipython:: python + :suppress: + + A = DataFrame({'key' : ['a', 'c', 'e'] * 2, + 'lvalue' : [1, 2, 3] * 2, + 'group' : ['a', 'a', 'a', 'b', 'b', 'b']}) + B = DataFrame({'key' : ['b', 'c', 'd'], + 'rvalue' : [1, 2, 3]}) + +.. ipython:: python + + A + + B + + ordered_merge(A, B, fill_method='ffill', left_by='group') + +.. 
_merging.multiple_join: + +Joining multiple DataFrame or Panel objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A list or tuple of DataFrames can also be passed to ``DataFrame.join`` to join +them together on their indexes. The same is true for ``Panel.join``. + +.. ipython:: python + + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + df1 + df1.join([df2, df3]) + +.. _merging.combine_first: + +.. _merging.combine_first.update: + +Merging together values within Series or DataFrame columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Another fairly common situation is to have two like-indexed (or similarly +indexed) Series or DataFrame objects and wanting to "patch" values in one +object from values for matching indices in the other. Here is an example: + +.. ipython:: python + + df1 = DataFrame([[nan, 3., 5.], [-4.6, np.nan, nan], + [nan, 7., nan]]) + df2 = DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], + index=[1, 2]) + +For this, use the ``combine_first`` method: + +.. ipython:: python + + df1.combine_first(df2) + +Note that this method only takes values from the right DataFrame if they are +missing in the left DataFrame. A related method, ``update``, alters non-NA +values inplace: + +.. ipython:: python + + df1.update(df2) + df1 diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst new file mode 100644 index 00000000..d00a19f2 --- /dev/null +++ b/doc/source/missing_data.rst @@ -0,0 +1,369 @@ +.. currentmodule:: pandas +.. _missing_data: + +************************* +Working with missing data +************************* + +In this section, we will discuss missing (also referred to as NA) values in +pandas. + +.. ipython:: python + :suppress: + + import numpy as np; randn = np.random.randn; randint =np.random.randint + from pandas import * + import matplotlib.pyplot as plt + +.. note:: + + The choice of using ``NaN`` internally to denote missing data was largely + for simplicity and performance reasons. It differs from the MaskedArray + approach of, for example, :mod:`scikits.timeseries`. We are hopeful that + NumPy will soon be able to provide a native NA type solution (similar to R) + performant enough to be used in pandas. + +Missing data basics +------------------- + +When / why does data become missing? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some might quibble over our usage of *missing*. By "missing" we simply mean +**null** or "not present for whatever reason". Many data sets simply arrive with +missing data, either because it exists and was not collected or it never +existed. For example, in a collection of financial time series, some of the time +series might start on different dates. Thus, values prior to the start date +would generally be marked as missing. + +In pandas, one of the most common ways that missing data is **introduced** into +a data set is by reindexing. For example + +.. ipython:: python + + df = DataFrame(randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], + columns=['one', 'two', 'three']) + df['four'] = 'bar' + df['five'] = df['one'] > 0 + df + df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']) + df2 + +Values considered "missing" +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As data comes in many shapes and forms, pandas aims to be flexible with regard +to handling missing data. 
While ``NaN`` is the default missing value marker for +reasons of computational speed and convenience, we need to be able to easily +detect this value with data of different types: floating point, integer, +boolean, and general object. In many cases, however, the Python ``None`` will +arise and we wish to also consider that "missing" or "null". Lastly, for legacy +reasons ``inf`` and ``-inf`` are also considered to be "null" in +computations. Since in NumPy divide-by-zero generates ``inf`` or ``-inf`` and +not ``NaN``, I think you will find this is a worthwhile trade-off (Zen of +Python: "practicality beats purity"). + +.. _missing.isnull: + +To make detecting missing values easier (and across different array dtypes), +pandas provides the :func:`~pandas.core.common.isnull` and +:func:`~pandas.core.common.notnull` functions, which are also methods on +``Series`` objects: + +.. ipython:: python + + df2['one'] + isnull(df2['one']) + df2['four'].notnull() + +**Summary:** ``NaN``, ``inf``, ``-inf``, and ``None`` (in object arrays) are +all considered missing by the ``isnull`` and ``notnull`` functions. + +Calculations with missing data +------------------------------ + +Missing values propagate naturally through arithmetic operations between pandas +objects. + +.. ipython:: python + :suppress: + + df = df2.ix[:, ['one', 'two', 'three']] + a = df2.ix[:5, ['one', 'two']].fillna(method='pad') + b = df2.ix[:5, ['one', 'two', 'three']] + +.. ipython:: python + + a + b + a + b + +The descriptive statistics and computational methods discussed in the +:ref:`data structure overview ` (and listed :ref:`here +` and :ref:`here `) are all written to +account for missing data. For example: + + * When summing data, NA (missing) values will be treated as zero + * If the data are all NA, the result will be NA + * Methods like **cumsum** and **cumprod** ignore NA values, but preserve them + in the resulting arrays + +.. ipython:: python + + df + df['one'].sum() + df.mean(1) + df.cumsum() + +NA values in GroupBy +~~~~~~~~~~~~~~~~~~~~ + +NA groups in GroupBy are automatically excluded. This behavior is consistent +with R, for example. + + + +Cleaning / filling missing data +-------------------------------- + +pandas objects are equipped with various data manipulation methods for dealing +with missing data. + +.. _missing_data.fillna: + +Filling missing values: fillna +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The **fillna** function can "fill in" NA values with non-null data in a couple +of ways, which we illustrate: + +**Replace NA with a scalar value** + +.. ipython:: python + + df2 + df2.fillna(0) + df2['four'].fillna('missing') + +**Fill gaps forward or backward** + +Using the same filling arguments as :ref:`reindexing `, we +can propagate non-null values forward or backward: + +.. ipython:: python + + df + df.fillna(method='pad') + +.. _missing_data.fillna.limit: + +**Limit the amount of filling** + +If we only want consecutive gaps filled up to a certain number of data points, +we can use the `limit` keyword: + +.. ipython:: python + :suppress: + + df.ix[2:4, :] = np.nan + +.. ipython:: python + + df + df.fillna(method='pad', limit=1) + +To remind you, these are the available filling methods: + +.. csv-table:: + :header: "Method", "Action" + :widths: 30, 50 + + pad / ffill, Fill values forward + bfill / backfill, Fill values backward + +With time series data, using pad/ffill is extremely common so that the "last +known value" is available at every time point. + +.. 
_missing_data.dropna: + +Dropping axis labels with missing data: dropna +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You may wish to simply exclude labels from a data set which refer to missing +data. To do this, use the **dropna** method: + +.. ipython:: python + :suppress: + + df['two'] = df['two'].fillna(0) + df['three'] = df['three'].fillna(0) + +.. ipython:: python + + df + df.dropna(axis=0) + df.dropna(axis=1) + df['one'].dropna() + +**dropna** is presently only implemented for Series and DataFrame, but will be +eventually added to Panel. Series.dropna is a simpler method as it only has one +axis to consider. DataFrame.dropna has considerably more options, which can be +examined :ref:`in the API `. + +.. _missing_data.interpolate: + +Interpolation +~~~~~~~~~~~~~ + +A linear **interpolate** method has been implemented on Series. The default +interpolation assumes equally spaced points. + +.. ipython:: python + :suppress: + + np.random.seed(123456) + idx = date_range('1/1/2000', periods=100, freq='BM') + ts = Series(randn(100), index=idx) + ts[1:20] = np.nan + ts[60:80] = np.nan + ts = ts.cumsum() + +.. ipython:: python + + ts.count() + + ts.head() + + ts.interpolate().count() + + ts.interpolate().head() + + @savefig series_interpolate.png width=6in + fig = plt.figure() + ts.interpolate().plot() + + plt.close('all') + +Index aware interpolation is available via the ``method`` keyword: + +.. ipython:: python + :suppress: + + ts = ts[[0, 1, 30, 60, 99]] + +.. ipython:: python + + ts + + ts.interpolate() + + ts.interpolate(method='time') + +For a floating-point index, use ``method='values'``: + +.. ipython:: python + :suppress: + + idx = [0., 1., 10.] + ser = Series([0., np.nan, 10.], idx) + +.. ipython:: python + + ser + + ser.interpolate() + + ser.interpolate(method='values') + +.. _missing_data.replace: + +Replacing Generic Values +~~~~~~~~~~~~~~~~~~~~~~~~ +Often times we want to replace arbitrary values with other values. New in v0.8 +is the ``replace`` method in Series/DataFrame that provides an efficient yet +flexible way to perform such replacements. + +For a Series, you can replace a single value or a list of values by another +value: + +.. ipython:: python + + ser = Series([0., 1., 2., 3., 4.]) + + ser.replace(0, 5) + +You can replace a list of values by a list of other values: + +.. ipython:: python + + ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + +You can also specify a mapping dict: + +.. ipython:: python + + ser.replace({0: 10, 1: 100}) + +For a DataFrame, you can specify individual values by column: + +.. ipython:: python + + df = DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + + df.replace({'a': 0, 'b': 5}, 100) + +Instead of replacing with specified values, you can treat all given values as +missing and interpolate over them: + +.. ipython:: python + + ser.replace([1, 2, 3], method='pad') + + +Missing data casting rules and indexing +--------------------------------------- + +While pandas supports storing arrays of integer and boolean type, these types +are not capable of storing missing data. Until we can switch to using a native +NA type in NumPy, we've established some "casting rules" when reindexing will +cause missing data to be introduced into, say, a Series or DataFrame. Here they +are: + +.. csv-table:: + :header: "data type", "Cast to" + :widths: 40, 40 + + integer, float + boolean, object + float, no cast + object, no cast + +For example: + +.. 
ipython:: python + + s = Series(randn(5), index=[0, 2, 4, 6, 7]) + s > 0 + (s > 0).dtype + crit = (s > 0).reindex(range(8)) + crit + crit.dtype + +Ordinarily NumPy will complain if you try to use an object array (even if it +contains boolean values) instead of a boolean array to get or set values from +an ndarray (e.g. selecting values based on some criteria). If a boolean vector +contains NAs, an exception will be generated: + +.. ipython:: python + :okexcept: + + reindexed = s.reindex(range(8)).fillna(0) + reindexed[crit] + +However, these can be filled in using **fillna** and it will work fine: + +.. ipython:: python + + reindexed[crit.fillna(False)] + reindexed[crit.fillna(True)] + diff --git a/doc/source/overview.rst b/doc/source/overview.rst new file mode 100644 index 00000000..4d891d38 --- /dev/null +++ b/doc/source/overview.rst @@ -0,0 +1,121 @@ +.. _overview: + +.. currentmodule:: pandas + +**************** +Package overview +**************** + +:mod:`pandas` consists of the following things + + * A set of labeled array data structures, the primary of which are + Series/TimeSeries and DataFrame + * Index objects enabling both simple axis indexing and multi-level / + hierarchical axis indexing + * An integrated group by engine for aggregating and transforming data sets + * Date range generation (date_range) and custom date offsets enabling the + implementation of customized frequencies + * Input/Output tools: loading tabular data from flat files (CSV, delimited, + Excel 2003), and saving and loading pandas objects from the fast and + efficient PyTables/HDF5 format. + * Memory-efficent "sparse" versions of the standard data structures for storing + data that is mostly missing or mostly constant (some fixed value) + * Moving window statistics (rolling mean, rolling standard deviation, etc.) + * Static and moving window linear and `panel regression + `__ + +Data structures at a glance +--------------------------- + +.. csv-table:: + :header: "Dimensions", "Name", "Description" + :widths: 15, 20, 50 + + 1, Series, "1D labeled homogeneously-typed array" + 1, TimeSeries, "Series with index containing datetimes" + 2, DataFrame, "General 2D labeled, size-mutable tabular structure with + potentially heterogeneously-typed columns" + 3, Panel, "General 3D labeled, also size-mutable array" + +Why more than 1 data structure? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The best way to think about the pandas data structures is as flexible +containers for lower dimensional data. For example, DataFrame is a container +for Series, and Panel is a container for DataFrame objects. We would like to be +able to insert and remove objects from these containers in a dictionary-like +fashion. + +Also, we would like sensible default behaviors for the common API functions +which take into account the typical orientation of time series and +cross-sectional data sets. When using ndarrays to store 2- and 3-dimensional +data, a burden is placed on the user to consider the orientation of the data +set when writing functions; axes are considered more or less equivalent (except +when C- or Fortran-contiguousness matters for performance). In pandas, the axes +are intended to lend more semantic meaning to the data; i.e., for a particular +data set there is likely to be a "right" way to orient the data. The goal, +then, is to reduce the amount of mental effort required to code up data +transformations in downstream functions. 
+ +For example, with tabular data (DataFrame) it is more semantically helpful to +think of the **index** (the rows) and the **columns** rather than axis 0 and +axis 1. And iterating through the columns of the DataFrame thus results in more +readable code: + +:: + + for col in df.columns: + series = df[col] + # do something with series + +Mutability and copying of data +------------------------------ + +All pandas data structures are value-mutable (the values they contain can be +altered) but not always size-mutable. The length of a Series cannot be +changed, but, for example, columns can be inserted into a DataFrame. However, +the vast majority of methods produce new objects and leave the input data +untouched. In general, though, we like to **favor immutability** where +sensible. + +Getting Support +--------------- + +The first stop for pandas issues and ideas is the `Github Issue Tracker +`__. If you have a general question, +pandas community experts can answer through `Stack Overflow +`__. + +Longer discussions occur on the `developer mailing list +`__, and commercial support +inquiries for Lambda Foundry should be sent to: support@lambdafoundry.com + +Credits +------- + +pandas development began at `AQR Capital Management `__ in +April 2008. It was open-sourced at the end of 2009. AQR continued to provide +resources for development through the end of 2011, and continues to contribute +bug reports today. + +Since January 2012, `Lambda Foundry `__, has +been providing development resources, as well as commercial support, +training, and consulting for pandas. + +pandas is only made possible by a group of people around the world like you +who have contributed new code, bug reports, fixes, comments and ideas. A +complete list can be found `on Github `__. + +Development Team +---------------- + +pandas is a part of the PyData project. The PyData Development Team is a +collection of developers focused on the improvement of Python's data +libraries. The core team that coordinates development can be found on `Github +`__. If you're interested in contributing, please +visit the `project website `__. + +License +------- + +.. literalinclude:: ../../LICENSE \ No newline at end of file diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst new file mode 100644 index 00000000..88f48101 --- /dev/null +++ b/doc/source/r_interface.rst @@ -0,0 +1,99 @@ +.. currentmodule:: pandas.rpy + +.. _rpy: + +****************** +rpy2 / R interface +****************** + +.. note:: + + This is all highly experimental. I would like to get more people involved + with building a nice RPy2 interface for pandas + + +If your computer has R and rpy2 (> 2.2) installed (which will be left to the +reader), you will be able to leverage the below functionality. On Windows, +doing this is quite an ordeal at the moment, but users on Unix-like systems +should find it quite easy. rpy2 evolves in time and the current interface is +designed for the 2.2.x series, and we recommend to use over other series +unless you are prepared to fix parts of the code. Released packages are available +in PyPi, but should the latest code in the 2.2.x series be wanted it can be obtained with: + +:: + + # if installing for the first time + hg clone http://bitbucket.org/lgautier/rpy2 + + cd rpy2 + hg pull + hg update version_2.2.x + sudo python setup.py install + +.. note:: + + To use R packages with this interface, you will need to install + them inside R yourself. At the moment it cannot install them for + you. 
+ +Once you have done installed R and rpy2, you should be able to import +``pandas.rpy.common`` without a hitch. + +Transferring R data sets into Python +------------------------------------ + +The **load_data** function retrieves an R data set and converts it to the +appropriate pandas object (most likely a DataFrame): + + +.. ipython:: python + + import pandas.rpy.common as com + infert = com.load_data('infert') + + infert.head() + + +Converting DataFrames into R objects +------------------------------------ + +.. versionadded:: 0.8 + +Starting from pandas 0.8, there is **experimental** support to convert +DataFrames into the equivalent R object (that is, **data.frame**): + +.. ipython:: python + + from pandas import DataFrame + + df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C':[7,8,9]}, + index=["one", "two", "three"]) + r_dataframe = com.convert_to_r_dataframe(df) + + print type(r_dataframe) + print r_dataframe + + +The DataFrame's index is stored as the ``rownames`` attribute of the +data.frame instance. + +You can also use **convert_to_r_matrix** to obtain a ``Matrix`` instance, but +bear in mind that it will only work with homogeneously-typed DataFrames (as +R matrices bear no information on the data type): + + +.. ipython:: python + + r_matrix = com.convert_to_r_matrix(df) + + print type(r_matrix) + print r_matrix + + +Calling R functions with pandas objects +--------------------------------------- + + + +High-level interface to R estimators +------------------------------------ diff --git a/doc/source/related.rst b/doc/source/related.rst new file mode 100644 index 00000000..ff83102c --- /dev/null +++ b/doc/source/related.rst @@ -0,0 +1,47 @@ +************************ +Related Python libraries +************************ + +la (larry) +---------- + +Keith Goodman's excellent `labeled array package +`__ is very similar to pandas in many regards, +though with some key differences. The main philosophical design difference is +to be a wrapper around a single NumPy ``ndarray`` object while adding axis +labeling and label-based operations and indexing. Because of this, creating a +size-mutable object with heterogeneous columns (e.g. DataFrame) is not possible +with the ``la`` package. + + - Provide a single n-dimensional object with labeled axes with functionally + analogous data alignment semantics to pandas objects + - Advanced / label-based indexing similar to that provided in pandas but + setting is not supported + - Stays much closer to NumPy arrays than pandas-- ``larry`` objects must be + homogeneously typed + - GroupBy support is relatively limited, but a few functions are available: + ``group_mean``, ``group_median``, and ``group_ranking`` + - It has a collection of analytical functions suited to quantitative + portfolio construction for financial applications + - It has a collection of moving window statistics implemented in + `Bottleneck `__ + +scikits.statsmodels +------------------- + +The main `statistics and econometrics library +`__ for Python. pandas has become a +dependency of this library. + +scikits.timeseries +------------------ + +`scikits.timeseries `__ provides a data +structure for fixed frequency time series data based on the numpy.MaskedArray +class. For time series data, it provides some of the same functionality to the +pandas Series class. It has many more functions for time series-specific +manipulation. Also, it has support for many more frequencies, though less +customizable by the user (so 5-minutely data is easier to do with pandas for +example). 
+ +We are aiming to merge these libraries together in the near future. diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst new file mode 100644 index 00000000..7f799a8e --- /dev/null +++ b/doc/source/reshaping.rst @@ -0,0 +1,357 @@ +.. currentmodule:: pandas +.. _reshaping: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + from pandas.core.reshape import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + from pandas.tools.tile import * + +************************** +Reshaping and Pivot Tables +************************** + +Reshaping by pivoting DataFrame objects +--------------------------------------- + +.. ipython:: + :suppress: + + In [1]: import pandas.util.testing as tm; tm.N = 3 + + In [2]: def unpivot(frame): + ...: N, K = frame.shape + ...: data = {'value' : frame.values.ravel('F'), + ...: 'variable' : np.asarray(frame.columns).repeat(N), + ...: 'date' : np.tile(np.asarray(frame.index), K)} + ...: columns = ['date', 'variable', 'value'] + ...: return DataFrame(data, columns=columns) + ...: + + In [3]: df = unpivot(tm.makeTimeDataFrame()) + +Data is often stored in CSV files or databases in so-called "stacked" or +"record" format: + +.. ipython:: python + + df + + +For the curious here is how the above DataFrame was created: + +.. code-block:: python + + import pandas.util.testing as tm; tm.N = 3 + def unpivot(frame): + N, K = frame.shape + data = {'value' : frame.values.ravel('F'), + 'variable' : np.asarray(frame.columns).repeat(N), + 'date' : np.tile(np.asarray(frame.index), K)} + return DataFrame(data, columns=['date', 'variable', 'value']) + df = unpivot(tm.makeTimeDataFrame()) + +To select out everything for variable ``A`` we could do: + +.. ipython:: python + + df[df['variable'] == 'A'] + +But suppose we wish to do time series operations with the variables. A better +representation would be where the ``columns`` are the unique variables and an +``index`` of dates identifies individual observations. To reshape the data into +this form, use the ``pivot`` function: + +.. ipython:: python + + df.pivot(index='date', columns='variable', values='value') + +If the ``values`` argument is omitted, and the input DataFrame has more than +one column of values which are not used as column or index inputs to ``pivot``, +then the resulting "pivoted" DataFrame will have :ref:`hierarchical columns +` whose topmost level indicates the respective value +column: + +.. ipython:: python + + df['value2'] = df['value'] * 2 + pivoted = df.pivot('date', 'variable') + pivoted + +You of course can then select subsets from the pivoted DataFrame: + +.. ipython:: python + + pivoted['value2'] + +Note that this returns a view on the underlying data in the case where the data +are homogeneously-typed. + +.. _reshaping.stacking: + +Reshaping by stacking and unstacking +------------------------------------ + +Closely related to the ``pivot`` function are the related ``stack`` and +``unstack`` functions currently available on Series and DataFrame. These +functions are designed to work together with ``MultiIndex`` objects (see the +section on :ref:`hierarchical indexing `). Here are +essentially what these functions do: + + - ``stack``: "pivot" a level of the (possibly hierarchical) column labels, + returning a DataFrame with an index with a new inner-most level of row + labels. 
+ - ``unstack``: inverse operation from ``stack``: "pivot" a level of the + (possibly hierarchical) row index to the column axis, producing a reshaped + DataFrame with a new inner-most level of column labels. + +The clearest way to explain is by example. Let's take a prior example data set +from the hierarchical indexing section: + +.. ipython:: python + + tuples = zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']]) + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + df = DataFrame(randn(8, 2), index=index, columns=['A', 'B']) + df2 = df[:4] + df2 + +The ``stack`` function "compresses" a level in the DataFrame's columns to +produce either: + + - A Series, in the case of a simple column Index + - A DataFrame, in the case of a ``MultiIndex`` in the columns + +If the columns have a ``MultiIndex``, you can choose which level to stack. The +stacked level becomes the new lowest level in a ``MultiIndex`` on the columns: + +.. ipython:: python + + stacked = df2.stack() + stacked + +With a "stacked" DataFrame or Series (having a ``MultiIndex`` as the +``index``), the inverse operation of ``stack`` is ``unstack``, which by default +unstacks the **last level**: + +.. ipython:: python + + stacked.unstack() + stacked.unstack(1) + stacked.unstack(0) + +.. _reshaping.unstack_by_name: + +If the indexes have names, you can use the level names instead of specifying +the level numbers: + +.. ipython:: python + + stacked.unstack('second') + +You may also stack or unstack more than one level at a time by passing a list +of levels, in which case the end result is as if each level in the list were +processed individually. + +These functions are intelligent about handling missing data and do not expect +each subgroup within the hierarchical index to have the same set of labels. +They also can handle the index being unsorted (but you can make it sorted by +calling ``sortlevel``, of course). Here is a more complex example: + +.. ipython:: python + + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')], + names=['exp', 'animal']) + df = DataFrame(randn(8, 4), index=index, columns=columns) + df2 = df.ix[[0, 1, 2, 4, 5, 7]] + df2 + +As mentioned above, ``stack`` can be called with a ``level`` argument to select +which level in the columns to stack: + +.. ipython:: python + + df2.stack('exp') + df2.stack('animal') + +Unstacking when the columns are a ``MultiIndex`` is also careful about doing +the right thing: + +.. ipython:: python + + df[:3].unstack(0) + df2.unstack(1) + +.. _reshaping.melt: + +Reshaping by Melt +----------------- + +The ``melt`` function found in ``pandas.core.reshape`` is useful to massage a +DataFrame into a format where one or more columns are identifier variables, +while all other columns, considered measured variables, are "pivoted" to the +row axis, leaving just two non-identifier columns, "variable" and "value". + +For instance, + +.. ipython:: python + + cheese = DataFrame({'first' : ['John', 'Mary'], + 'last' : ['Doe', 'Bo'], + 'height' : [5.5, 6.0], + 'weight' : [130, 150]}) + cheese + melt(cheese, id_vars=['first', 'last']) + +Combining with stats and GroupBy +-------------------------------- + +It should be no shock that combining ``pivot`` / ``stack`` / ``unstack`` with +GroupBy and the basic Series and DataFrame statistical functions can produce +some very expressive and fast data manipulations. + +.. 
ipython:: python + + df + df.stack().mean(1).unstack() + + # same result, another way + df.groupby(level=1, axis=1).mean() + + df.stack().groupby(level=1).mean() + + df.mean().unstack(0) + + +Pivot tables and cross-tabulations +---------------------------------- + +.. _reshaping.pivot: + +The function ``pandas.pivot_table`` can be used to create spreadsheet-style pivot +tables. It takes a number of arguments + +- ``data``: A DataFrame object +- ``values``: a column or a list of columns to aggregate +- ``rows``: list of columns to group by on the table rows +- ``cols``: list of columns to group by on the table columns +- ``aggfunc``: function to use for aggregation, defaulting to ``numpy.mean`` + +Consider a data set like this: + +.. ipython:: python + + df = DataFrame({'A' : ['one', 'one', 'two', 'three'] * 6, + 'B' : ['A', 'B', 'C'] * 8, + 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, + 'D' : np.random.randn(24), + 'E' : np.random.randn(24)}) + df + +We can produce pivot tables from this data very easily: + +.. ipython:: python + + pivot_table(df, values='D', rows=['A', 'B'], cols=['C']) + pivot_table(df, values='D', rows=['B'], cols=['A', 'C'], aggfunc=np.sum) + pivot_table(df, values=['D','E'], rows=['B'], cols=['A', 'C'], aggfunc=np.sum) + +The result object is a DataFrame having potentially hierarchical indexes on the +rows and columns. If the ``values`` column name is not given, the pivot table +will include all of the data that can be aggregated in an additional level of +hierarchy in the columns: + +.. ipython:: python + + pivot_table(df, rows=['A', 'B'], cols=['C']) + +You can render a nice output of the table omitting the missing values by +calling ``to_string`` if you wish: + +.. ipython:: python + + table = pivot_table(df, rows=['A', 'B'], cols=['C']) + print table.to_string(na_rep='') + +Note that ``pivot_table`` is also available as an instance method on DataFrame. + +Cross tabulations +~~~~~~~~~~~~~~~~~ + +Use the ``crosstab`` function to compute a cross-tabulation of two (or more) +factors. By default ``crosstab`` computes a frequency table of the factors +unless an array of values and an aggregation function are passed. + +It takes a number of arguments + +- ``rows``: array-like, values to group by in the rows +- ``cols``: array-like, values to group by in the columns +- ``values``: array-like, optional, array of values to aggregate according to + the factors +- ``aggfunc``: function, optional, If no values array is passed, computes a + frequency table +- ``rownames``: sequence, default None, must match number of row arrays passed +- ``colnames``: sequence, default None, if passed, must match number of column + arrays passed +- ``margins``: boolean, default False, Add row/column margins (subtotals) + +Any Series passed will have their name attributes used unless row or column +names for the cross-tabulation are specified + +For example: + +.. ipython:: python + + foo, bar, dull, shiny, one, two = 'foo', 'bar', 'dull', 'shiny', 'one', 'two' + a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) + b = np.array([one, one, two, one, two, one], dtype=object) + c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) + crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + +.. 
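+
+If an array of values and an aggregation function are passed as well, the same
+factors produce an aggregated table rather than a frequency table. A small
+sketch, reusing the ``a``, ``b`` and ``c`` arrays defined above with
+hypothetical values:
+
+.. code-block:: python
+
+    values = np.random.randn(6)
+
+    # sum `values` over the same factor combinations instead of counting
+    crosstab(a, [b, c], values=values, aggfunc=np.sum,
+             rownames=['a'], colnames=['b', 'c'])
+
+.. 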
_reshaping.pivot.margins: + +Adding margins (partial aggregates) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you pass ``margins=True`` to ``pivot_table``, special ``All`` columns and +rows will be added with partial group aggregates across the categories on the +rows and columns: + +.. ipython:: python + + df.pivot_table(rows=['A', 'B'], cols='C', margins=True, aggfunc=np.std) + +.. _reshaping.tile: + +Tiling +------ + +.. _reshaping.tile.cut: + +The ``cut`` function computes groupings for the values of the input array and +is often used to transform continuous variables to discrete or categorical +variables: + +.. ipython:: python + + ages = np.array([10, 15, 13, 12, 23, 25, 28, 59, 60]) + + + cut(ages, bins=3) + +If the ``bins`` keyword is an integer, then equal-width bins are formed. +Alternatively we can specify custom bin-edges: + +.. ipython:: python + + cut(ages, bins=[0, 18, 35, 70]) diff --git a/doc/source/sparse.rst b/doc/source/sparse.rst new file mode 100644 index 00000000..fe3e4ead --- /dev/null +++ b/doc/source/sparse.rst @@ -0,0 +1,135 @@ +.. currentmodule:: pandas +.. _sparse: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + +********************** +Sparse data structures +********************** + +We have implemented "sparse" versions of Series, DataFrame, and Panel. These +are not sparse in the typical "mostly 0". You can view these objects as being +"compressed" where any data matching a specific value (NaN/missing by default, +though any value can be chosen) is omitted. A special ``SparseIndex`` object +tracks where data has been "sparsified". This will make much more sense in an +example. All of the standard pandas data structures have a ``to_sparse`` +method: + +.. ipython:: python + + ts = Series(randn(10)) + ts[2:-2] = np.nan + sts = ts.to_sparse() + sts + +The ``to_sparse`` method takes a ``kind`` argument (for the sparse index, see +below) and a ``fill_value``. So if we had a mostly zero Series, we could +convert it to sparse with ``fill_value=0``: + +.. ipython:: python + + ts.fillna(0).to_sparse(fill_value=0) + +The sparse objects exist for memory efficiency reasons. Suppose you had a +large, mostly NA DataFrame: + +.. ipython:: python + + df = DataFrame(randn(10000, 4)) + df.ix[:9998] = np.nan + sdf = df.to_sparse() + sdf + sdf.density + +As you can see, the density (% of values that have not been "compressed") is +extremely low. This sparse object takes up much less memory on disk (pickled) +and in the Python interpreter. Functionally, their behavior should be nearly +identical to their dense counterparts. + +Any sparse object can be converted back to the standard dense form by calling +``to_dense``: + +.. ipython:: python + + sts.to_dense() + +.. _sparse.array: + +SparseArray +----------- + +``SparseArray`` is the base layer for all of the sparse indexed data +structures. It is a 1-dimensional ndarray-like object storing only values +distinct from the ``fill_value``: + +.. ipython:: python + + arr = np.random.randn(10) + arr[2:5] = np.nan; arr[7:8] = np.nan + sparr = SparseArray(arr) + sparr + +Like the indexed objects (SparseSeries, SparseDataFrame, SparsePanel), a +``SparseArray`` can be converted back to a regular ndarray by calling +``to_dense``: + +.. ipython:: python + + sparr.to_dense() + +.. 
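+
+As with ``to_sparse`` above, a fill value other than ``NaN`` can be chosen for
+mostly-constant data. A brief sketch, assuming the ``SparseArray`` constructor
+accepts a ``fill_value`` argument analogous to that of ``to_sparse``:
+
+.. code-block:: python
+
+    dense = np.zeros(10)
+    dense[[2, 5, 8]] = np.random.randn(3)
+
+    # only the three entries different from the fill value are stored
+    SparseArray(dense, fill_value=0)
+
+.. 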
_sparse.list: + +SparseList +---------- + +``SparseList`` is a list-like data structure for managing a dynamic collection +of SparseArrays. To create one, simply call the ``SparseList`` constructor with +a ``fill_value`` (defaulting to ``NaN``): + +.. ipython:: python + + spl = SparseList() + spl + +The two important methods are ``append`` and ``to_array``. ``append`` can +accept scalar values or any 1-dimensional sequence: + +.. ipython:: python + :suppress: + + from numpy import nan + +.. ipython:: python + + spl.append(np.array([1., nan, nan, 2., 3.])) + spl.append(5) + spl.append(sparr) + spl + +As you can see, all of the contents are stored internally as a list of +memory-efficient ``SparseArray`` objects. Once you've accumulated all of the +data, you can call ``to_array`` to get a single ``SparseArray`` with all the +data: + +.. ipython:: python + + spl.to_array() + +SparseIndex objects +------------------- + +Two kinds of ``SparseIndex`` are implemented, ``block`` and ``integer``. We +recommend using ``block`` as it's more memory efficient. The ``integer`` format +keeps an arrays of all of the locations where the data are not equal to the +fill value. The ``block`` format tracks only the locations and sizes of blocks +of data. diff --git a/doc/source/themes/agogo/layout.html b/doc/source/themes/agogo/layout.html new file mode 100644 index 00000000..cd0f3d7f --- /dev/null +++ b/doc/source/themes/agogo/layout.html @@ -0,0 +1,95 @@ +{# + agogo/layout.html + ~~~~~~~~~~~~~~~~~ + + Sphinx layout template for the agogo theme, originally written + by Andi Albrecht. + + :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +#} +{% extends "basic/layout.html" %} + +{% block header %} +
+
+ {%- if logo %} + + {%- endif %} + {%- block headertitle %} +

{{ shorttitle|e }}

+ {%- endblock %} +
+ {%- for rellink in rellinks|reverse %} + {{ rellink[3] }} + {%- if not loop.last %}{{ reldelim2 }}{% endif %} + {%- endfor %} +
+
+
+{% endblock %} + +{% block content %} +
+
+ +
+ {%- block document %} + {{ super() }} + {%- endblock %} +
+
+
+
+{% endblock %} + +{% block footer %} + +{% endblock %} + +{% block relbar1 %}{% endblock %} +{% block relbar2 %}{% endblock %} diff --git a/doc/source/themes/agogo/static/agogo.css_t b/doc/source/themes/agogo/static/agogo.css_t new file mode 100644 index 00000000..ef909b72 --- /dev/null +++ b/doc/source/themes/agogo/static/agogo.css_t @@ -0,0 +1,476 @@ +/* + * agogo.css_t + * ~~~~~~~~~~~ + * + * Sphinx stylesheet -- agogo theme. + * + * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +* { + margin: 0px; + padding: 0px; +} + +body { + font-family: {{ theme_bodyfont }}; + line-height: 1.4em; + color: black; + background-color: {{ theme_bgcolor }}; +} + + +/* Page layout */ + +div.header, div.content, div.footer { + max-width: {{ theme_pagewidth }}; + margin-left: auto; + margin-right: auto; +} + +div.header-wrapper { + background: {{ theme_headerbg }}; + padding: 1em 1em 0; + border-bottom: 3px solid #2e3436; + min-height: 0px; +} + + +/* Default body styles */ +a { + color: {{ theme_linkcolor }}; +} + +div.bodywrapper a, div.footer a { + text-decoration: underline; +} + +.clearer { + clear: both; +} + +.left { + float: left; +} + +.right { + float: right; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +h1, h2, h3, h4 { + font-family: {{ theme_headerfont }}; + font-weight: normal; + color: {{ theme_headercolor2 }}; + margin-bottom: .8em; +} + +h1 { + color: {{ theme_headercolor1 }}; +} + +h2 { + padding-bottom: .5em; + border-bottom: 1px solid {{ theme_headercolor2 }}; +} + +a.headerlink { + visibility: hidden; + color: #dddddd; + padding-left: .3em; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink { + visibility: visible; +} + +img { + border: 0; +} + +pre { + background-color: #EEE; + padding: 0.5em; +} + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 2px 7px 1px 7px; + border-left: 0.2em solid black; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +dt:target, .highlighted { + background-color: #fbe54e; +} + +/* Header */ + +/* +div.header { + padding-top: 10px; + padding-bottom: 10px; +} +*/ + +div.header {} + +div.header h1 { + font-family: {{ theme_headerfont }}; + font-weight: normal; + font-size: 180%; + letter-spacing: .08em; +} + +div.header h1 a { + color: white; +} + +div.header div.rel { + text-decoration: none; +} +/* margin-top: 1em; */ + +div.header div.rel a { + margin-top: 1em; + color: {{ theme_headerlinkcolor }}; + letter-spacing: .1em; + text-transform: uppercase; + padding: 3px 1em; +} + +p.logo { + float: right; +} + +img.logo { + border: 0; +} + + +/* Content */ +div.content-wrapper { + background-color: white; + padding: 1em; +} +/* + padding-top: 20px; + padding-bottom: 20px; +*/ + +/* float: left; */ + +div.document { + max-width: {{ theme_documentwidth }}; +} + +div.body { + padding-right: 2em; + text-align: {{ theme_textalign }}; +} + +div.document ul { + margin: 1.5em; + list-style-type: square; +} + +div.document dd { + margin-left: 1.2em; + margin-top: .4em; + margin-bottom: 1em; +} + +div.document .section { + margin-top: 1.7em; +} +div.document .section:first-child { + margin-top: 0px; +} + +div.document div.highlight { + padding: 3px; + background-color: #eeeeec; + 
border-top: 2px solid #dddddd; + border-bottom: 2px solid #dddddd; + margin-top: .8em; + margin-bottom: .8em; +} + +div.document h2 { + margin-top: .7em; +} + +div.document p { + margin-bottom: .5em; +} + +div.document li.toctree-l1 { + margin-bottom: 1em; +} + +div.document .descname { + font-weight: bold; +} + +div.document .docutils.literal { + background-color: #eeeeec; + padding: 1px; +} + +div.document .docutils.xref.literal { + background-color: transparent; + padding: 0px; +} + +div.document blockquote { + margin: 1em; +} + +div.document ol { + margin: 1.5em; +} + + +/* Sidebar */ + + +div.sidebar { + width: {{ theme_sidebarwidth }}; + padding: 0 1em; + float: right; + font-size: .93em; +} + +div.sidebar a, div.header a { + text-decoration: none; +} + +div.sidebar a:hover, div.header a:hover { + text-decoration: underline; +} + +div.sidebar h3 { + color: #2e3436; + text-transform: uppercase; + font-size: 130%; + letter-spacing: .1em; +} + +div.sidebar ul { + list-style-type: none; +} + +div.sidebar li.toctree-l1 a { + display: block; + padding: 1px; + border: 1px solid #dddddd; + background-color: #eeeeec; + margin-bottom: .4em; + padding-left: 3px; + color: #2e3436; +} + +div.sidebar li.toctree-l2 a { + background-color: transparent; + border: none; + margin-left: 1em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l3 a { + background-color: transparent; + border: none; + margin-left: 2em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l2:last-child a { + border-bottom: none; +} + +div.sidebar li.toctree-l1.current a { + border-right: 5px solid {{ theme_headerlinkcolor }}; +} + +div.sidebar li.toctree-l1.current li.toctree-l2 a { + border-right: none; +} + + +/* Footer */ + +div.footer-wrapper { + background: {{ theme_footerbg }}; + border-top: 4px solid #babdb6; + padding-top: 10px; + padding-bottom: 10px; + min-height: 80px; +} + +div.footer, div.footer a { + color: #888a85; +} + +div.footer .right { + text-align: right; +} + +div.footer .left { + text-transform: uppercase; +} + + +/* Styles copied from basic theme */ + +img.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + clear: both; + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li div.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable td { + text-align: left; + vertical-align: top; +} + 
+table.indextable dl, table.indextable dd { + margin-top: 0; + margin-bottom: 0; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +/* -- viewcode extension ---------------------------------------------------- */ + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family:: {{ theme_bodyfont }}; +} + +div.viewcode-block:target { + margin: -1px -3px; + padding: 0 3px; + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +th.field-name { + white-space: nowrap; +} diff --git a/doc/source/themes/agogo/static/bgfooter.png b/doc/source/themes/agogo/static/bgfooter.png new file mode 100644 index 0000000000000000000000000000000000000000..9ce5bdd902943fdf8b0c0ca6a545297e1e2cc665 GIT binary patch literal 434 zcmV;j0ZsmiP)Px#24YJ`L;%wO*8tD73qoQ5000SaNLh0L01FcU01FcV0GgZ_00007bV*G`2iXD> z2Q(2CT#42I000?uMObu0Z*6U5Zgc=ca%Ew3Wn>_CX>@2HM@dakSAh-}0003ENklR?sq9~H`=l5UI-{JW_f9!)=Hwush3JC}Y z1gFM&r>$lJNPt^*1k!w;l|obx>lr$2IOaI$n=(gBBaj^I0=y%@K5N&GIU&-%OE_~V zX=m=_j7d`hvubQRuF+xT63vIfWnC3%kKN*T3l7ob3nEC2R->wU1Y)4)(7_t^thiqb zj$CO7xBn9gg`*!MY$}SI|_*)!a*&V0w7h>cUb&$Grh37iJ=C%Yn c>}w1E0Z4f>1OEiDlmGw#07*qoM6N<$g4BwtIsgCw literal 0 HcmV?d00001 diff --git a/doc/source/themes/agogo/static/bgtop.png b/doc/source/themes/agogo/static/bgtop.png new file mode 100644 index 0000000000000000000000000000000000000000..a0d4709bac8f79943a817195c086461c8c4d5419 GIT binary patch literal 430 zcmV;f0a5;mP)Px#24YJ`L;zI)R{&FzA;Z4_000SaNLh0L01FcU01FcV0GgZ_00007bV*G`2iXD> z2Q3AZhV-)l000?uMObu0Z*6U5Zgc=ca%Ew3Wn>_CX>@2HM@dakSAh-}0003ANklMo8vqN`cM=KwSQV|n zk}naE+VzlN;kK@Ej${PSkI$-R6-Yfp`zA;^O$`)7`gRi{-0i?owGIbX{p>Nc##93U z;sA|ayOYkG%F9M0iEMUM*s3NDYSS=KN2ht8Rv|7nv77i{NTO47R)}V_+2H~mL-nTR z_8j}*%6Qm8?#7NU2kM$#gcP&kO?iw|n}ynz+r-~FA9nKcZnfixWvZ&d28Cc_6&_Pe zMpbjI>9r+<=}NIDz4mCd3U++H?rrHcYxH&eeB|)>mnv*N#44ILM2zL6yU!VVWSrgp Y0Yu&#qm)=by8r+H07*qoM6N<$f@HC)j{pDw literal 0 HcmV?d00001 diff --git a/doc/source/themes/agogo/theme.conf b/doc/source/themes/agogo/theme.conf new file mode 100644 index 00000000..3fc88580 --- /dev/null +++ b/doc/source/themes/agogo/theme.conf @@ -0,0 +1,19 @@ +[theme] +inherit = basic +stylesheet = agogo.css +pygments_style = tango + +[options] +bodyfont = "Verdana", Arial, sans-serif +headerfont = "Georgia", "Times New Roman", serif +pagewidth = 70em +documentwidth = 50em +sidebarwidth = 20em +bgcolor = #eeeeec +headerbg = url(bgtop.png) top left repeat-x +footerbg = url(bgfooter.png) top left repeat-x +linkcolor = #ce5c00 +headercolor1 = #204a87 +headercolor2 = #3465a4 +headerlinkcolor = #fcaf3e +textalign = justify \ No newline at end of file diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst new file mode 100644 index 00000000..ac62bae5 --- /dev/null +++ b/doc/source/timeseries.rst @@ -0,0 +1,913 @@ +.. currentmodule:: pandas +.. _timeseries: + +.. 
ipython:: python + :suppress: + + from datetime import datetime + import numpy as np + np.random.seed(123456) + from pandas import * + randn = np.random.randn + randint = np.random.randint + np.set_printoptions(precision=4, suppress=True) + from dateutil.relativedelta import relativedelta + from pandas.tseries.api import * + from pandas.tseries.offsets import * + +******************************** +Time Series / Date functionality +******************************** + +pandas has proven very successful as a tool for working with time series data, +especially in the financial data analysis space. With the 0.8 release, we have +further improved the time series API in pandas by leaps and bounds. Using the +new NumPy ``datetime64`` dtype, we have consolidated a large number of features +from other Python libraries like ``scikits.timeseries`` as well as created +a tremendous amount of new functionality for manipulating time series data. + +In working with time series data, we will frequently seek to: + + - generate sequences of fixed-frequency dates and time spans + - conform or convert time series to a particular frequency + - compute "relative" dates based on various non-standard time increments + (e.g. 5 business days before the last business day of the year), or "roll" + dates forward or backward + +pandas provides a relatively compact and self-contained set of tools for +performing the above tasks. + +Create a range of dates: + +.. ipython:: python + + # 72 hours starting with midnight Jan 1st, 2011 + rng = date_range('1/1/2011', periods=72, freq='H') + rng[:5] + +Index pandas objects with dates: + +.. ipython:: python + + ts = Series(randn(len(rng)), index=rng) + ts.head() + +Change frequency and fill gaps: + +.. ipython:: python + + # to 45 minute frequency and forward fill + converted = ts.asfreq('45Min', method='pad') + converted.head() + +Resample: + +.. ipython:: python + + # Daily means + ts.resample('D', how='mean') + + +.. _timeseries.representation: + +Time Stamps vs. Time Spans +-------------------------- + +Time-stamped data is the most basic type of timeseries data that associates +values with points in time. For pandas objects it means using the points in +time to create the index + +.. ipython:: python + + dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + ts = Series(np.random.randn(3), dates) + + type(ts.index) + + ts + +However, in many cases it is more natural to associate things like change +variables with a time span instead. + +For example: + +.. ipython:: python + + periods = PeriodIndex([Period('2012-01'), Period('2012-02'), + Period('2012-03')]) + + ts = Series(np.random.randn(3), periods) + + type(ts.index) + + ts + +Starting with 0.8, pandas allows you to capture both representations and +convert between them. Under the hood, pandas represents timestamps using +instances of ``Timestamp`` and sequences of timestamps using instances of +``DatetimeIndex``. For regular time spans, pandas uses ``Period`` objects for +scalar values and ``PeriodIndex`` for sequences of spans. Better support for +irregular intervals with arbitrary start and end points are forth-coming in +future releases. + +.. _timeseries.daterange: + +Generating Ranges of Timestamps +------------------------------- + +To generate an index with time stamps, you can use either the DatetimeIndex or +Index constructor and pass in a list of datetime objects: + +.. 
ipython:: python
+
+   dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)]
+   index = DatetimeIndex(dates)
+   index # Note the frequency information
+
+   index = Index(dates)
+   index # Automatically converted to DatetimeIndex
+
+Practically, this becomes very cumbersome because we often need a very long
+index with a large number of timestamps. If we need timestamps on a regular
+frequency, we can use the pandas functions ``date_range`` and ``bdate_range``
+to create timestamp indexes.
+
+.. ipython:: python
+
+   index = date_range('2000-1-1', periods=1000, freq='M')
+   index
+
+   index = bdate_range('2012-1-1', periods=250)
+   index
+
+Convenience functions like ``date_range`` and ``bdate_range`` utilize a
+variety of frequency aliases. The default frequency for ``date_range`` is a
+**calendar day** while the default for ``bdate_range`` is a **business day**.
+
+.. ipython:: python
+
+   start = datetime(2011, 1, 1)
+   end = datetime(2012, 1, 1)
+
+   rng = date_range(start, end)
+   rng
+
+   rng = bdate_range(start, end)
+   rng
+
+``date_range`` and ``bdate_range`` make it easy to generate a range of dates
+using various combinations of parameters like ``start``, ``end``,
+``periods``, and ``freq``:
+
+.. ipython:: python
+
+   date_range(start, end, freq='BM')
+
+   date_range(start, end, freq='W')
+
+   bdate_range(end=end, periods=20)
+
+   bdate_range(start=start, periods=20)
+
+The start and end dates are strictly inclusive, so no dates outside of the
+specified range will be generated.
+
+.. _timeseries.datetimeindex:
+
+DatetimeIndex
+~~~~~~~~~~~~~
+
+One of the main uses for ``DatetimeIndex`` is as an index for pandas objects.
+The ``DatetimeIndex`` class contains many timeseries-related optimizations:
+
+ - A large range of dates for various offsets are pre-computed and cached
+   under the hood in order to make generating subsequent date ranges very fast
+   (just have to grab a slice)
+ - Fast shifting using the ``shift`` and ``tshift`` methods on pandas objects
+ - Unioning of overlapping DatetimeIndex objects with the same frequency is
+   very fast (important for fast data alignment)
+ - Quick access to date fields via properties such as ``year``, ``month``, etc.
+ - Regularization functions like ``snap`` and very fast ``asof`` logic
+
+``DatetimeIndex`` can be used like a regular index and offers all of its
+intelligent functionality like selection, slicing, etc.
+
+.. ipython:: python
+
+   rng = date_range(start, end, freq='BM')
+   ts = Series(randn(len(rng)), index=rng)
+   ts.index
+   ts[:5].index
+   ts[::2].index
+
+You can pass in dates and strings that parse to dates as indexing parameters:
+
+.. ipython:: python
+
+   ts['1/31/2011']
+
+   ts[datetime(2011, 12, 25):]
+
+   ts['10/31/2011':'12/31/2011']
+
+A ``truncate`` convenience function is provided that is equivalent to slicing:
+
+.. ipython:: python
+
+   ts.truncate(before='10/31/2011', after='12/31/2011')
+
+To provide convenience for accessing longer time series, you can also pass in
+the year or year and month as strings:
+
+.. ipython:: python
+
+   ts['2011']
+
+   ts['2011-6']
+
+Even complicated fancy indexing that breaks the DatetimeIndex's frequency
+regularity will result in a ``DatetimeIndex`` (but frequency is lost):
+
+.. ipython:: python
+
+   ts[[0, 2, 6]].index
+
+DatetimeIndex objects have all the basic functionality of regular Index objects
+and a smorgasbord of advanced timeseries-specific methods for easy frequency
+processing.
+
+.. seealso::
+    :ref:`Reindexing methods `
+
+.. 
note:: + + While pandas does not force you to have a sorted date index, some of these + methods may have unexpected or incorrect behavior if the dates are + unsorted. So please be careful. + + +.. _timeseries.offsets: + +DateOffset objects +------------------ + +In the preceding examples, we created DatetimeIndex objects at various +frequencies by passing in frequency strings like 'M', 'W', and 'BM to the +``freq`` keyword. Under the hood, these frequency strings are being translated +into an instance of pandas ``DateOffset``, which represents a regular +frequency increment. Specific offset logic like "month", "business day", or +"one hour" is represented in its various subclasses. + +.. csv-table:: + :header: "Class name", "Description" + :widths: 15, 65 + + DateOffset, "Generic offset class, defaults to 1 calendar day" + BDay, "business day (weekday)" + Week, "one week, optionally anchored on a day of the week" + WeekOfMonth, "the x-th day of the y-th week of each month" + MonthEnd, "calendar month end" + MonthBegin, "calendar month begin" + BMonthEnd, "business month end" + BMonthBegin, "business month begin" + QuarterEnd, "calendar quarter end" + QuarterBegin, "calendar quarter begin" + BQuarterEnd, "business quarter end" + BQuarterBegin, "business quarter begin" + YearEnd, "calendar year end" + YearBegin, "calendar year begin" + BYearEnd, "business year end" + BYearBegin, "business year begin" + Hour, "one hour" + Minute, "one minute" + Second, "one second" + Milli, "one millisecond" + Micro, "one microsecond" + + +The basic ``DateOffset`` takes the same arguments as +``dateutil.relativedelta``, which works like: + +.. ipython:: python + + d = datetime(2008, 8, 18) + d + relativedelta(months=4, days=5) + +We could have done the same thing with ``DateOffset``: + +.. ipython:: python + + from pandas.tseries.offsets import * + d + DateOffset(months=4, days=5) + +The key features of a ``DateOffset`` object are: + + - it can be added / subtracted to/from a datetime object to obtain a + shifted date + - it can be multiplied by an integer (positive or negative) so that the + increment will be applied multiple times + - it has ``rollforward`` and ``rollback`` methods for moving a date forward + or backward to the next or previous "offset date" + +Subclasses of ``DateOffset`` define the ``apply`` function which dictates +custom date increment logic, such as adding business days: + +.. code-block:: python + + class BDay(DateOffset): + """DateOffset increments between business days""" + def apply(self, other): + ... + +.. ipython:: python + + d - 5 * BDay() + d + BMonthEnd() + +The ``rollforward`` and ``rollback`` methods do exactly what you would expect: + +.. ipython:: python + + d + offset = BMonthEnd() + offset.rollforward(d) + offset.rollback(d) + +It's definitely worth exploring the ``pandas.tseries.offsets`` module and the +various docstrings for the classes. + +Parametric offsets +~~~~~~~~~~~~~~~~~~ + +Some of the offsets can be "parameterized" when created to result in different +behavior. For example, the ``Week`` offset for generating weekly data accepts a +``weekday`` parameter which results in the generated dates always lying on a +particular day of the week: + +.. ipython:: python + + d + Week() + d + Week(weekday=4) + (d + Week(weekday=4)).weekday() + +Another example is parameterizing ``YearEnd`` with the specific ending month: + +.. ipython:: python + + d + YearEnd() + d + YearEnd(month=6) + +.. 
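+
+Parameterized offsets can also be combined with the integer multiplication and
+``rollforward`` / ``rollback`` behavior described above. A brief sketch, using
+the same ``d`` as in the preceding examples:
+
+.. code-block:: python
+
+    # two weeks ahead, landing on a Monday (weekday=0)
+    d + 2 * Week(weekday=0)
+
+    # roll d forward to the next fiscal year end anchored on June
+    YearEnd(month=6).rollforward(d)
+
+.. 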
_timeseries.alias: + +Offset Aliases +~~~~~~~~~~~~~~ + +A number of string aliases are given to useful common time series +frequencies. We will refer to these aliases as *offset aliases* +(referred to as *time rules* prior to v0.8.0). + +.. csv-table:: + :header: "Alias", "Description" + :widths: 15, 100 + + "B", "business day frequency" + "D", "calendar day frequency" + "W", "weekly frequency" + "M", "month end frequency" + "BM", "business month end frequency" + "MS", "month start frequency" + "BMS", "business month start frequency" + "Q", "quarter end frequency" + "BQ", "business quarter endfrequency" + "QS", "quarter start frequency" + "BQS", "business quarter start frequency" + "A", "year end frequency" + "BA", "business year end frequency" + "AS", "year start frequency" + "BAS", "business year start frequency" + "H", "hourly frequency" + "T", "minutely frequency" + "S", "secondly frequency" + "L", "milliseonds" + "U", "microseconds" + +Combining Aliases +~~~~~~~~~~~~~~~~~ + +As we have seen previously, the alias and the offset instance are fungible in +most functions: + +.. ipython:: python + + date_range(start, periods=5, freq='B') + + date_range(start, periods=5, freq=BDay()) + +You can combine together day and intraday offsets: + +.. ipython:: python + + date_range(start, periods=10, freq='2h20min') + + date_range(start, periods=10, freq='1D10U') + +Anchored Offsets +~~~~~~~~~~~~~~~~ + +For some frequencies you can specify an anchoring suffix: + +.. csv-table:: + :header: "Alias", "Description" + :widths: 15, 100 + + "W\-SUN", "weekly frequency (sundays). Same as 'W'" + "W\-MON", "weekly frequency (mondays)" + "W\-TUE", "weekly frequency (tuesdays)" + "W\-WED", "weekly frequency (wednesdays)" + "W\-THU", "weekly frequency (thursdays)" + "W\-FRI", "weekly frequency (fridays)" + "W\-SAT", "weekly frequency (saturdays)" + "(B)Q(S)\-DEC", "quarterly frequency, year ends in December. Same as 'Q'" + "(B)Q(S)\-JAN", "quarterly frequency, year ends in January" + "(B)Q(S)\-FEB", "quarterly frequency, year ends in February" + "(B)Q(S)\-MAR", "quarterly frequency, year ends in March" + "(B)Q(S)\-APR", "quarterly frequency, year ends in April" + "(B)Q(S)\-MAY", "quarterly frequency, year ends in May" + "(B)Q(S)\-JUN", "quarterly frequency, year ends in June" + "(B)Q(S)\-JUL", "quarterly frequency, year ends in July" + "(B)Q(S)\-AUG", "quarterly frequency, year ends in August" + "(B)Q(S)\-SEP", "quarterly frequency, year ends in September" + "(B)Q(S)\-OCT", "quarterly frequency, year ends in October" + "(B)Q(S)\-NOV", "quarterly frequency, year ends in November" + "(B)A(S)\-DEC", "annual frequency, anchored end of December. Same as 'A'" + "(B)A(S)\-JAN", "annual frequency, anchored end of January" + "(B)A(S)\-FEB", "annual frequency, anchored end of February" + "(B)A(S)\-MAR", "annual frequency, anchored end of March" + "(B)A(S)\-APR", "annual frequency, anchored end of April" + "(B)A(S)\-MAY", "annual frequency, anchored end of May" + "(B)A(S)\-JUN", "annual frequency, anchored end of June" + "(B)A(S)\-JUL", "annual frequency, anchored end of July" + "(B)A(S)\-AUG", "annual frequency, anchored end of August" + "(B)A(S)\-SEP", "annual frequency, anchored end of September" + "(B)A(S)\-OCT", "annual frequency, anchored end of October" + "(B)A(S)\-NOV", "annual frequency, anchored end of November" + +These can be used as arguments to ``date_range``, ``bdate_range``, constructors +for ``DatetimeIndex``, as well as various other timeseries-related functions +in pandas. 
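+
+For instance, a quick sketch (reusing the ``start`` timestamp from the
+examples above) of generating dates with anchored aliases:
+
+.. code-block:: python
+
+    # weekly dates anchored on Wednesdays
+    date_range(start, periods=5, freq='W-WED')
+
+    # business quarterly dates, fiscal year ending in June
+    bdate_range(start, periods=4, freq='BQ-JUN')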
+ +Legacy Aliases +~~~~~~~~~~~~~~ +Note that prior to v0.8.0, time rules had a slightly different look. Pandas +will continue to support the legacy time rules for the time being but it is +strongly recommended that you switch to using the new offset aliases. + +.. csv-table:: + :header: "Legacy Time Rule", "Offset Alias" + :widths: 15, 65 + + "WEEKDAY", "B" + "EOM", "BM" + "W\@MON", "W\-MON" + "W\@TUE", "W\-TUE" + "W\@WED", "W\-WED" + "W\@THU", "W\-THU" + "W\@FRI", "W\-FRI" + "W\@SAT", "W\-SAT" + "W\@SUN", "W\-SUN" + "Q\@JAN", "BQ\-JAN" + "Q\@FEB", "BQ\-FEB" + "Q\@MAR", "BQ\-MAR" + "A\@JAN", "BA\-JAN" + "A\@FEB", "BA\-FEB" + "A\@MAR", "BA\-MAR" + "A\@APR", "BA\-APR" + "A\@MAY", "BA\-MAY" + "A\@JUN", "BA\-JUN" + "A\@JUL", "BA\-JUL" + "A\@AUG", "BA\-AUG" + "A\@SEP", "BA\-SEP" + "A\@OCT", "BA\-OCT" + "A\@NOV", "BA\-NOV" + "A\@DEC", "BA\-DEC" + "min", "T" + "ms", "L" + "us": "U" + +As you can see, legacy quarterly and annual frequencies are business quarter +and business year ends. Please also note the legacy time rule for milliseconds +``ms`` versus the new offset alias for month start ``MS``. This means that +offset alias parsing is case sensitive. + +.. _timeseries.advanced_datetime: + +Time series-related instance methods +------------------------------------ + +Shifting / lagging +~~~~~~~~~~~~~~~~~~ + +One may want to *shift* or *lag* the values in a TimeSeries back and forward in +time. The method for this is ``shift``, which is available on all of the pandas +objects. In DataFrame, ``shift`` will currently only shift along the ``index`` +and in Panel along the ``major_axis``. + +.. ipython:: python + + ts = ts[:5] + ts.shift(1) + +The shift method accepts an ``freq`` argument which can accept a +``DateOffset`` class or other ``timedelta``-like object or also a :ref:`offset alias `: + +.. ipython:: python + + ts.shift(5, freq=datetools.bday) + ts.shift(5, freq='BM') + +Rather than changing the alignment of the data and the index, ``DataFrame`` and +``TimeSeries`` objects also have a ``tshift`` convenience method that changes +all the dates in the index by a specified number of offsets: + +.. ipython:: python + + ts.tshift(5, freq='D') + +Note that with ``tshift``, the leading entry is no longer NaN because the data +is not being realigned. + +Frequency conversion +~~~~~~~~~~~~~~~~~~~~ + +The primary function for changing frequencies is the ``asfreq`` function. +For a ``DatetimeIndex``, this is basically just a thin, but convenient wrapper +around ``reindex`` which generates a ``date_range`` and calls ``reindex``. + +.. ipython:: python + + dr = date_range('1/1/2010', periods=3, freq=3 * datetools.bday) + ts = Series(randn(3), index=dr) + ts + ts.asfreq(BDay()) + +``asfreq`` provides a further convenience so you can specify an interpolation +method for any gaps that may appear after the frequency conversion + +.. ipython:: python + + ts.asfreq(BDay(), method='pad') + +Filling forward / backward +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Related to ``asfreq`` and ``reindex`` is the ``fillna`` function documented in +the :ref:`missing data section `. + +.. _timeseries.resampling: + +Up- and downsampling +-------------------- + +With 0.8, pandas introduces simple, powerful, and efficient functionality for +performing resampling operations during frequency conversion (e.g., converting +secondly data into 5-minutely data). This is extremely common in, but not +limited to, financial applications. + +.. 
ipython:: python
+
+   rng = date_range('1/1/2012', periods=100, freq='S')
+
+   ts = Series(randint(0, 500, len(rng)), index=rng)
+
+   ts.resample('5Min', how='sum')
+
+The ``resample`` function is very flexible and allows you to specify many
+different parameters to control the frequency conversion and resampling
+operation.
+
+The ``how`` parameter can be a function name or a NumPy array function that
+takes an array and produces an aggregated value:
+
+.. ipython:: python
+
+   ts.resample('5Min') # default is mean
+
+   ts.resample('5Min', how='ohlc')
+
+   ts.resample('5Min', how=np.max)
+
+Any function available via :ref:`dispatching ` can be given to
+the ``how`` parameter by name, including ``sum``, ``mean``, ``std``, ``max``,
+``min``, ``median``, ``first``, ``last``, ``ohlc``.
+
+For downsampling, ``closed`` can be set to 'left' or 'right' to specify which
+end of the interval is closed:
+
+.. ipython:: python
+
+   ts.resample('5Min', closed='right')
+
+   ts.resample('5Min', closed='left')
+
+For upsampling, the ``fill_method`` and ``limit`` parameters can be specified
+to interpolate over the gaps that are created:
+
+.. ipython:: python
+
+   # from secondly to every 250 milliseconds
+
+   ts[:2].resample('250L')
+
+   ts[:2].resample('250L', fill_method='pad')
+
+   ts[:2].resample('250L', fill_method='pad', limit=2)
+
+Parameters like ``label`` and ``loffset`` are used to manipulate the resulting
+labels. ``label`` specifies whether the result is labeled with the beginning or
+the end of the interval. ``loffset`` performs a time adjustment on the output
+labels.
+
+.. ipython:: python
+
+   ts.resample('5Min') # by default label='right'
+
+   ts.resample('5Min', label='left')
+
+   ts.resample('5Min', label='left', loffset='1s')
+
+The ``axis`` parameter can be set to 0 or 1 and allows you to resample the
+specified axis for a DataFrame.
+
+``kind`` can be set to 'timestamp' or 'period' to convert the resulting index
+to/from time-stamp and time-span representations. By default ``resample``
+retains the input representation.
+
+``convention`` can be set to 'start' or 'end' when resampling period data
+(detail below). It specifies how low frequency periods are converted to higher
+frequency periods.
+
+Note that 0.8 marks a watershed in the timeseries functionality in pandas. In
+previous versions, resampling had to be done using a combination of
+``date_range``, ``groupby`` with ``asof``, and then calling an aggregation
+function on the grouped object. This was not nearly as convenient or performant
+as the new pandas timeseries API.
+
+.. _timeseries.periods:
+
+Time Span Representation
+------------------------
+
+Regular intervals of time are represented by ``Period`` objects in pandas while
+sequences of ``Period`` objects are collected in a ``PeriodIndex``, which can
+be created with the convenience function ``period_range``.
+
+Period
+~~~~~~
+A ``Period`` represents a span of time (e.g., a day, a month, a quarter, etc.).
+It can be created using a frequency alias:
+
+.. ipython:: python
+
+   Period('2012', freq='A-DEC')
+
+   Period('2012-1-1', freq='D')
+
+   Period('2012-1-1 19:00', freq='H')
+
+Unlike time stamped data, pandas does not support frequencies at multiples of
+DateOffsets (e.g., '3Min') for periods.
+
+Adding and subtracting integers from periods shifts the period by its own
+frequency.
+
+.. 
ipython:: python
+
+   p = Period('2012', freq='A-DEC')
+
+   p + 1
+
+   p - 3
+
+Taking the difference of ``Period`` instances with the same frequency will
+return the number of frequency units between them:
+
+.. ipython:: python
+
+   Period('2012', freq='A-DEC') - Period('2002', freq='A-DEC')
+
+PeriodIndex and period_range
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Regular sequences of ``Period`` objects can be collected in a ``PeriodIndex``,
+which can be constructed using the ``period_range`` convenience function:
+
+.. ipython:: python
+
+   prng = period_range('1/1/2011', '1/1/2012', freq='M')
+   prng
+
+The ``PeriodIndex`` constructor can also be used directly:
+
+.. ipython:: python
+
+   PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M')
+
+Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas
+objects:
+
+.. ipython:: python
+
+   Series(randn(len(prng)), prng)
+
+Frequency Conversion and Resampling with PeriodIndex
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The frequency of Periods and PeriodIndex can be converted via the ``asfreq``
+method. Let's start with the fiscal year 2011, ending in December:
+
+.. ipython:: python
+
+   p = Period('2011', freq='A-DEC')
+   p
+
+We can convert it to a monthly frequency. Using the ``how`` parameter, we can
+specify whether to return the starting or ending month:
+
+.. ipython:: python
+
+   p.asfreq('M', how='start')
+
+   p.asfreq('M', how='end')
+
+The shorthands 's' and 'e' are provided for convenience:
+
+.. ipython:: python
+
+   p.asfreq('M', 's')
+   p.asfreq('M', 'e')
+
+Converting to a "super-period" (e.g., annual frequency is a super-period of
+quarterly frequency) automatically returns the super-period that includes the
+input period:
+
+.. ipython:: python
+
+   p = Period('2011-12', freq='M')
+
+   p.asfreq('A-NOV')
+
+Note that since we converted to an annual frequency that ends the year in
+November, the monthly period of December 2011 is actually in the 2012 A-NOV
+period.
+
+.. _timeseries.quarterly:
+
+Period conversions with anchored frequencies are particularly useful for
+working with various quarterly data common to economics, business, and other
+fields. Many organizations define quarters relative to the month in which their
+fiscal year starts and ends. Thus, the first quarter of 2011 could start in
+2010 or a few months into 2011. Via anchored frequencies, pandas works with all
+quarterly frequencies ``Q-JAN`` through ``Q-DEC``.
+
+``Q-DEC`` defines regular calendar quarters:
+
+.. ipython:: python
+
+   p = Period('2012Q1', freq='Q-DEC')
+
+   p.asfreq('D', 's')
+
+   p.asfreq('D', 'e')
+
+``Q-MAR`` defines fiscal year end in March:
+
+.. ipython:: python
+
+   p = Period('2011Q4', freq='Q-MAR')
+
+   p.asfreq('D', 's')
+
+   p.asfreq('D', 'e')
+
+.. _timeseries.interchange:
+
+Converting between Representations
+----------------------------------
+
+Timestamped data can be converted to PeriodIndex-ed data using ``to_period``
+and vice-versa using ``to_timestamp``:
+
+.. ipython:: python
+
+   rng = date_range('1/1/2012', periods=5, freq='M')
+
+   ts = Series(randn(len(rng)), index=rng)
+
+   ts
+
+   ps = ts.to_period()
+
+   ps
+
+   ps.to_timestamp()
+
+Remember that 's' and 'e' can be used to return the timestamps at the start or
+end of the period:
+
+.. ipython:: python
+
+   ps.to_timestamp('D', how='s')
+
+Converting between period and timestamp enables some convenient arithmetic
+functions to be used.
In the following example, we convert a quarterly
+frequency with year ending in November to 9am of the end of the month following
+the quarter end:
+
+.. ipython:: python
+
+   prng = period_range('1990Q1', '2000Q4', freq='Q-NOV')
+
+   ts = Series(randn(len(prng)), prng)
+
+   ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
+
+   ts.head()
+
+.. _timeseries.timezone:
+
+Time Zone Handling
+------------------
+
+Using ``pytz``, pandas provides rich support for working with timestamps in
+different time zones. By default, pandas objects are time zone unaware:
+
+.. ipython:: python
+
+   rng = date_range('3/6/2012 00:00', periods=15, freq='D')
+   print(rng.tz)
+
+To supply the time zone, you can use the ``tz`` keyword to ``date_range`` and
+other functions:
+
+.. ipython:: python
+
+   rng_utc = date_range('3/6/2012 00:00', periods=10, freq='D', tz='UTC')
+   print(rng_utc.tz)
+
+Timestamps, like Python's ``datetime.datetime`` object, can be either time zone
+naive or time zone aware. Naive time series and DatetimeIndex objects can be
+*localized* using ``tz_localize``:
+
+.. ipython:: python
+
+   ts = Series(randn(len(rng)), rng)
+
+   ts_utc = ts.tz_localize('UTC')
+   ts_utc
+
+You can use the ``tz_convert`` method to convert tz-aware pandas objects to
+another time zone:
+
+.. ipython:: python
+
+   ts_utc.tz_convert('US/Eastern')
+
+Under the hood, all timestamps are stored in UTC. Scalar values from a
+``DatetimeIndex`` with a time zone will have their fields (day, hour, minute)
+localized to the time zone. However, timestamps with the same UTC value are
+still considered to be equal even if they are in different time zones:
+
+.. ipython:: python
+
+   rng_eastern = rng_utc.tz_convert('US/Eastern')
+   rng_berlin = rng_utc.tz_convert('Europe/Berlin')
+
+   rng_eastern[5]
+   rng_berlin[5]
+   rng_eastern[5] == rng_berlin[5]
+
+Like Series, DataFrame, and DatetimeIndex, Timestamps can be converted to other
+time zones using ``tz_convert``:
+
+.. ipython:: python
+
+   rng_eastern[5]
+   rng_berlin[5]
+   rng_eastern[5].tz_convert('Europe/Berlin')
+
+Localization of Timestamps works just like for DatetimeIndex and TimeSeries:
+
+.. ipython:: python
+
+   rng[5]
+   rng[5].tz_localize('Asia/Shanghai')
+
+
+Operations between TimeSeries in different time zones will yield UTC
+TimeSeries, aligning the data on the UTC timestamps:
+
+.. ipython:: python
+
+   eastern = ts_utc.tz_convert('US/Eastern')
+   berlin = ts_utc.tz_convert('Europe/Berlin')
+   result = eastern + berlin
+   result
+   result.index
diff --git a/doc/source/v0.4.x.txt b/doc/source/v0.4.x.txt
new file mode 100644
index 00000000..0fd7cc63
--- /dev/null
+++ b/doc/source/v0.4.x.txt
@@ -0,0 +1,77 @@
+..
_whatsnew_04x: + +v.0.4.3 through v0.4.1 (September 25 - October 9, 2011) +------------------------------------------------------- + +New Features +~~~~~~~~~~~~ + +- Added Python 3 support using 2to3 (PR200_) +- :ref:`Added ` ``name`` attribute to ``Series``, now + prints as part of ``Series.__repr__`` +- :ref:`Added ` instance methods ``isnull`` and ``notnull`` to + Series (PR209_, GH203_) +- :ref:`Added ` ``Series.align`` method for aligning two series + with choice of join method (ENH56_) +- :ref:`Added ` method ``get_level_values`` to + ``MultiIndex`` (IS188_) +- :ref:`Set ` values in mixed-type + ``DataFrame`` objects via ``.ix`` indexing attribute (GH135_) +- Added new ``DataFrame`` :ref:`methods ` + ``get_dtype_counts`` and property ``dtypes`` (ENHdc_) +- Added :ref:`ignore_index ` option to + ``DataFrame.append`` to stack DataFrames (ENH1b_) +- ``read_csv`` tries to :ref:`sniff ` delimiters using + ``csv.Sniffer`` (PR146_) +- ``read_csv`` can :ref:`read ` multiple columns into a + ``MultiIndex``; DataFrame's ``to_csv`` method writes out a corresponding + ``MultiIndex`` (PR151_) +- ``DataFrame.rename`` has a new ``copy`` parameter to :ref:`rename + ` a DataFrame in place (ENHed_) +- :ref:`Enable ` unstacking by name (PR142_) +- :ref:`Enable ` ``sortlevel`` to work by level (PR141_) + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Altered binary operations on differently-indexed SparseSeries objects + to use the integer-based (dense) alignment logic which is faster with a + larger number of blocks (GH205_) +- Wrote faster Cython data alignment / merging routines resulting in + substantial speed increases +- Improved performance of ``isnull`` and ``notnull``, a regression from v0.3.0 + (GH187_) +- Refactored code related to ``DataFrame.join`` so that intermediate aligned + copies of the data in each ``DataFrame`` argument do not need to be created. + Substantial performance increases result (GH176_) +- Substantially improved performance of generic ``Index.intersection`` and + ``Index.union`` +- Implemented ``BlockManager.take`` resulting in significantly faster ``take`` + performance on mixed-type ``DataFrame`` objects (GH104_) +- Improved performance of ``Series.sort_index`` +- Significant groupby performance enhancement: removed unnecessary integrity + checks in DataFrame internals that were slowing down slicing operations to + retrieve groups +- Optimized ``_ensure_index`` function resulting in performance savings in + type-checking Index objects +- Wrote fast time series merging / joining methods in Cython. Will be + integrated later into DataFrame.join and related functions + +.. _PR146: https://github.com/pydata/pandas/pull/146 +.. _ENH1b: https://github.com/pydata/pandas/commit/1ba56251f0013ff7cd8834e9486cef2b10098371 +.. _ENHdc: https://github.com/pydata/pandas/commit/dca3c5c5a6a3769ee01465baca04cfdfa66a4f76 +.. _GH135: https://github.com/pydata/pandas/issues/135 +.. _PR151: https://github.com/pydata/pandas/pull/151 +.. _ENHed: https://github.com/pydata/pandas/commit/edd9f1945fc010a57fa0ae3b3444d1fffe592591 +.. _PR142: https://github.com/pydata/pandas/pull/142 +.. _PR141: https://github.com/pydata/pandas/pull/141 +.. _IS188: https://github.com/pydata/pandas/issues/188 +.. _ENH56: https://github.com/pydata/pandas/commit/56e0c9ffafac79ce262b55a6a13e1b10a88fbe93 +.. _GH187: https://github.com/pydata/pandas/issues/187 +.. _GH176: https://github.com/pydata/pandas/issues/176 +.. _GH104: https://github.com/pydata/pandas/issues/104 +.. 
_GH205: https://github.com/pydata/pandas/issues/205 +.. _PR209: https://github.com/pydata/pandas/pull/209 +.. _GH203: https://github.com/pydata/pandas/issues/203 +.. _PR200: https://github.com/pydata/pandas/pull/200 + diff --git a/doc/source/v0.5.0.txt b/doc/source/v0.5.0.txt new file mode 100644 index 00000000..017d10d4 --- /dev/null +++ b/doc/source/v0.5.0.txt @@ -0,0 +1,60 @@ + +.. _whatsnew_050: + +v.0.5.0 (October 24, 2011) +-------------------------- + +New Features +~~~~~~~~~~~~ + +- :ref:`Added ` ``DataFrame.align`` method with standard join options +- :ref:`Added ` ``parse_dates`` option to ``read_csv`` and ``read_table`` methods to optionally try to parse dates in the index columns +- :ref:`Added ` ``nrows``, ``chunksize``, and ``iterator`` arguments to ``read_csv`` and ``read_table``. The last two return a new ``TextParser`` class capable of lazily iterating through chunks of a flat file (GH242_) +- :ref:`Added ` ability to join on multiple columns in ``DataFrame.join`` (GH214_) +- Added private ``_get_duplicates`` function to ``Index`` for identifying duplicate values more easily (ENH5c_) +- :ref:`Added ` column attribute access to DataFrame. +- :ref:`Added ` Python tab completion hook for DataFrame columns. (PR233_, GH230_) +- :ref:`Implemented ` ``Series.describe`` for Series containing objects (PR241_) +- :ref:`Added ` inner join option to ``DataFrame.join`` when joining on key(s) (GH248_) +- :ref:`Implemented ` selecting DataFrame columns by passing a list to ``__getitem__`` (GH253_) +- :ref:`Implemented ` & and | to intersect / union Index objects, respectively (GH261_) +- :ref:`Added` ``pivot_table`` convenience function to pandas namespace (GH234_) +- :ref:`Implemented ` ``Panel.rename_axis`` function (GH243_) +- DataFrame will show index level names in console output (PR334_) +- :ref:`Implemented ` ``Panel.take`` +- :ref:`Added` ``set_eng_float_format`` for alternate DataFrame floating point string formatting (ENH61_) +- :ref:`Added ` convenience ``set_index`` function for creating a DataFrame index from its existing columns +- :ref:`Implemented ` ``groupby`` hierarchical index level name (GH223_) +- :ref:`Added ` support for different delimiters in ``DataFrame.to_csv`` (PR244_) +- TODO: DOCS ABOUT TAKE METHODS + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- VBENCH Major performance improvements in file parsing functions ``read_csv`` and ``read_table`` +- VBENCH Added Cython function for converting tuples to ndarray very fast. Speeds up many MultiIndex-related operations +- VBENCH Refactored merging / joining code into a tidy class and disabled unnecessary computations in the float/object case, thus getting about 10% better performance (GH211_) +- VBENCH Improved speed of ``DataFrame.xs`` on mixed-type DataFrame objects by about 5x, regression from 0.3.0 (GH215_) +- VBENCH With new ``DataFrame.align`` method, speeding up binary operations between differently-indexed DataFrame objects by 10-25%. +- VBENCH Significantly sped up conversion of nested dict into DataFrame (GH212_) +- VBENCH Significantly speed up DataFrame ``__repr__`` and ``count`` on large mixed-type DataFrame objects + +.. _GH214: https://github.com/pydata/pandas/issues/214 +.. _GH248: https://github.com/pydata/pandas/issues/248 +.. _GH253: https://github.com/pydata/pandas/issues/253 +.. _GH261: https://github.com/pydata/pandas/issues/261 +.. _GH234: https://github.com/pydata/pandas/issues/234 +.. _GH243: https://github.com/pydata/pandas/issues/243 +.. 
_GH223: https://github.com/pydata/pandas/issues/223 +.. _PR244: https://github.com/pydata/pandas/pull/244 +.. _PR233: https://github.com/pydata/pandas/pull/233 +.. _GH230: https://github.com/pydata/pandas/issues/230 +.. _PR241: https://github.com/pydata/pandas/pull/241 +.. _GH242: https://github.com/pydata/pandas/issues/242 +.. _GH212: https://github.com/pydata/pandas/issues/212 +.. _GH211: https://github.com/pydata/pandas/issues/211 +.. _GH215: https://github.com/pydata/pandas/issues/215 +.. _GH213: https://github.com/pydata/pandas/issues/213 +.. _ENH61: https://github.com/pydata/pandas/commit/6141961 +.. _PR334: https://github.com/pydata/pandas/pull/334 +.. _ENH5c: https://github.com/pydata/pandas/commit/5ca6ff5d822ee4ddef1ec0d87b6d83d8b4bbd3eb diff --git a/doc/source/v0.6.0.txt b/doc/source/v0.6.0.txt new file mode 100644 index 00000000..e72aec60 --- /dev/null +++ b/doc/source/v0.6.0.txt @@ -0,0 +1,100 @@ +.. _whatsnew_060: + +v.0.6.0 (November 25, 2011) +--------------------------- + +New Features +~~~~~~~~~~~~ +- :ref:`Added ` ``melt`` function to ``pandas.core.reshape`` +- :ref:`Added ` ``level`` parameter to group by level in Series and DataFrame descriptive statistics (PR313_) +- :ref:`Added ` ``head`` and ``tail`` methods to Series, analogous to to DataFrame (PR296_) +- :ref:`Added ` ``Series.isin`` function which checks if each value is contained in a passed sequence (GH289_) +- :ref:`Added ` ``float_format`` option to ``Series.to_string`` +- :ref:`Added ` ``skip_footer`` (GH291_) and ``converters`` (GH343_) options to ``read_csv`` and ``read_table`` +- :ref:`Added ` ``drop_duplicates`` and ``duplicated`` functions for removing duplicate DataFrame rows and checking for duplicate rows, respectively (GH319_) +- :ref:`Implemented ` operators '&', '|', '^', '-' on DataFrame (GH347_) +- :ref:`Added ` ``Series.mad``, mean absolute deviation +- :ref:`Added ` ``QuarterEnd`` DateOffset (PR321_) +- :ref:`Added ` ``dot`` to DataFrame (GH65_) +- :ref:`Added ` ``orient`` option to ``Panel.from_dict`` (GH359_, GH301_) +- :ref:`Added ` ``orient`` option to ``DataFrame.from_dict`` +- :ref:`Added ` passing list of tuples or list of lists to ``DataFrame.from_records`` (GH357_) +- :ref:`Added ` multiple levels to groupby (GH103_) +- :ref:`Allow ` multiple columns in ``by`` argument of ``DataFrame.sort_index`` (GH92_, PR362_) +- :ref:`Added ` fast ``get_value`` and ``put_value`` methods to DataFrame (GH360_) +- :ref:`Added ` ``cov`` instance methods to Series and DataFrame (GH194_, PR362_) +- :ref:`Added ` ``kind='bar'`` option to ``DataFrame.plot`` (PR348_) +- :ref:`Added ` ``idxmin`` and ``idxmax`` to Series and DataFrame (PR286_) +- :ref:`Added ` ``read_clipboard`` function to parse DataFrame from clipboard (GH300_) +- :ref:`Added ` ``nunique`` function to Series for counting unique elements (GH297_) +- :ref:`Made ` DataFrame constructor use Series name if no columns passed (GH373_) +- :ref:`Support ` regular expressions in read_table/read_csv (GH364_) +- :ref:`Added ` ``DataFrame.to_html`` for writing DataFrame to HTML (PR387_) +- :ref:`Added ` support for MaskedArray data in DataFrame, masked values converted to NaN (PR396_) +- :ref:`Added ` ``DataFrame.boxplot`` function (GH368_) +- :ref:`Can ` pass extra args, kwds to DataFrame.apply (GH376_) +- :ref:`Implement ` ``DataFrame.join`` with vector ``on`` argument (GH312_) +- :ref:`Added ` ``legend`` boolean flag to ``DataFrame.plot`` (GH324_) +- :ref:`Can ` pass multiple levels to ``stack`` and ``unstack`` (GH370_) +- :ref:`Can ` pass multiple 
values columns to ``pivot_table`` (GH381_) +- :ref:`Use ` Series name in GroupBy for result index (GH363_) +- :ref:`Added ` ``raw`` option to ``DataFrame.apply`` for performance if only need ndarray (GH309_) +- Added proper, tested weighted least squares to standard and panel OLS (GH303_) + +Performance Enhancements +~~~~~~~~~~~~~~~~~~~~~~~~ +- VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the codebase (GH361_) +- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (GH309_) +- VBENCH Improved performance of ``MultiIndex.from_tuples`` +- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations +- VBENCH + DOCUMENT Add ``raw`` option to ``DataFrame.apply`` for getting better performance when +- VBENCH Faster cythonized count by level in Series and DataFrame (GH341_) +- VBENCH? Significant GroupBy performance enhancement with multiple keys with many "empty" combinations +- VBENCH New Cython vectorized function ``map_infer`` speeds up ``Series.apply`` and ``Series.map`` significantly when passed elementwise Python function, motivated by (PR355_) +- VBENCH Significantly improved performance of ``Series.order``, which also makes np.unique called on a Series faster (GH327_) +- VBENCH Vastly improved performance of GroupBy on axes with a MultiIndex (GH299_) + +.. _GH65: https://github.com/pydata/pandas/issues/65 +.. _GH92: https://github.com/pydata/pandas/issues/92 +.. _GH103: https://github.com/pydata/pandas/issues/103 +.. _GH194: https://github.com/pydata/pandas/issues/194 +.. _GH289: https://github.com/pydata/pandas/issues/289 +.. _GH291: https://github.com/pydata/pandas/issues/291 +.. _GH297: https://github.com/pydata/pandas/issues/297 +.. _GH299: https://github.com/pydata/pandas/issues/299 +.. _GH300: https://github.com/pydata/pandas/issues/300 +.. _GH301: https://github.com/pydata/pandas/issues/301 +.. _GH303: https://github.com/pydata/pandas/issues/303 +.. _GH305: https://github.com/pydata/pandas/issues/305 +.. _GH308: https://github.com/pydata/pandas/issues/308 +.. _GH309: https://github.com/pydata/pandas/issues/309 +.. _GH312: https://github.com/pydata/pandas/issues/312 +.. _GH319: https://github.com/pydata/pandas/issues/319 +.. _GH324: https://github.com/pydata/pandas/issues/324 +.. _GH327: https://github.com/pydata/pandas/issues/327 +.. _GH341: https://github.com/pydata/pandas/issues/341 +.. _GH343: https://github.com/pydata/pandas/issues/343 +.. _GH347: https://github.com/pydata/pandas/issues/347 +.. _GH357: https://github.com/pydata/pandas/issues/357 +.. _GH359: https://github.com/pydata/pandas/issues/359 +.. _GH360: https://github.com/pydata/pandas/issues/360 +.. _GH361: https://github.com/pydata/pandas/issues/361 +.. _GH363: https://github.com/pydata/pandas/issues/363 +.. _GH364: https://github.com/pydata/pandas/issues/364 +.. _GH368: https://github.com/pydata/pandas/issues/368 +.. _GH370: https://github.com/pydata/pandas/issues/370 +.. _GH373: https://github.com/pydata/pandas/issues/373 +.. _GH376: https://github.com/pydata/pandas/issues/376 +.. _GH381: https://github.com/pydata/pandas/issues/381 +.. _GH382: https://github.com/pydata/pandas/issues/382 +.. _GH393: https://github.com/pydata/pandas/issues/393 +.. _PR286: https://github.com/pydata/pandas/pull/286 +.. _PR296: https://github.com/pydata/pandas/pull/296 +.. _PR313: https://github.com/pydata/pandas/pull/313 +.. 
_PR321: https://github.com/pydata/pandas/pull/321 +.. _PR348: https://github.com/pydata/pandas/pull/348 +.. _PR355: https://github.com/pydata/pandas/pull/355 +.. _PR362: https://github.com/pydata/pandas/pull/362 +.. _PR386: https://github.com/pydata/pandas/pull/386 +.. _PR387: https://github.com/pydata/pandas/pull/387 +.. _PR396: https://github.com/pydata/pandas/pull/396 diff --git a/doc/source/v0.6.1.txt b/doc/source/v0.6.1.txt new file mode 100644 index 00000000..b95e9f10 --- /dev/null +++ b/doc/source/v0.6.1.txt @@ -0,0 +1,62 @@ + +.. _whatsnew_061: + +v.0.6.1 (December 13, 2011) +--------------------------- + +New features +~~~~~~~~~~~~ +- Can :ref:`append single rows ` (as Series) to a DataFrame +- Add Spearman and Kendall rank :ref:`correlation ` + options to Series.corr and DataFrame.corr (GH428_) +- :ref:`Added ` ``get_value`` and ``set_value`` methods to + Series, DataFrame, and Panel for very low-overhead access (>2x faster in many + cases) to scalar elements (GH437_, GH438_). ``set_value`` is capable of + producing an enlarged object. +- Add PyQt table widget to sandbox (PR435_) +- DataFrame.align can :ref:`accept Series arguments ` + and an :ref:`axis option ` (GH461_) +- Implement new :ref:`SparseArray ` and :ref:`SparseList ` + data structures. SparseSeries now derives from SparseArray (GH463_) +- :ref:`Better console printing options ` (PR453_) +- Implement fast :ref:`data ranking ` for Series and + DataFrame, fast versions of scipy.stats.rankdata (GH428_) +- Implement :ref:`DataFrame.from_items ` alternate + constructor (GH444_) +- DataFrame.convert_objects method for :ref:`inferring better dtypes ` + for object columns (GH302_) +- Add :ref:`rolling_corr_pairwise ` function for + computing Panel of correlation matrices (GH189_) +- Add :ref:`margins ` option to :ref:`pivot_table + ` for computing subgroup aggregates (GH114_) +- Add ``Series.from_csv`` function (PR482_) +- :ref:`Can pass ` DataFrame/DataFrame and + DataFrame/Series to rolling_corr/rolling_cov (GH #462) +- MultiIndex.get_level_values can :ref:`accept the level name ` + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Improve memory usage of `DataFrame.describe` (do not copy data + unnecessarily) (PR #425) + +- Optimize scalar value lookups in the general case by 25% or more in Series + and DataFrame + +- Fix performance regression in cross-sectional count in DataFrame, affecting + DataFrame.dropna speed +- Column deletion in DataFrame copies no data (computes views on blocks) (GH + #158) + +.. _GH114: https://github.com/pydata/pandas/issues/114 +.. _GH189: https://github.com/pydata/pandas/issues/302 +.. _GH302: https://github.com/pydata/pandas/issues/302 +.. _GH428: https://github.com/pydata/pandas/issues/428 +.. _GH437: https://github.com/pydata/pandas/issues/437 +.. _GH438: https://github.com/pydata/pandas/issues/438 +.. _GH444: https://github.com/pydata/pandas/issues/444 +.. _GH461: https://github.com/pydata/pandas/issues/461 +.. _GH463: https://github.com/pydata/pandas/issues/463 +.. _PR435: https://github.com/pydata/pandas/pull/435 +.. _PR453: https://github.com/pydata/pandas/pull/453 +.. _PR482: https://github.com/pydata/pandas/pull/482 diff --git a/doc/source/v0.7.0.txt b/doc/source/v0.7.0.txt new file mode 100644 index 00000000..6ff748f1 --- /dev/null +++ b/doc/source/v0.7.0.txt @@ -0,0 +1,308 @@ +.. 
_whatsnew_0700: + +v.0.7.0 (February 9, 2012) +-------------------------- + +New features +~~~~~~~~~~~~ + +- New unified :ref:`merge function ` for efficiently performing + full gamut of database / relational-algebra operations. Refactored existing + join methods to use the new infrastructure, resulting in substantial + performance gains (GH220_, GH249_, GH267_) + +- New :ref:`unified concatenation function ` for concatenating + Series, DataFrame or Panel objects along an axis. Can form union or + intersection of the other axes. Improves performance of ``Series.append`` and + ``DataFrame.append`` (GH468_, GH479_, GH273_) + +- :ref:`Can ` pass multiple DataFrames to + `DataFrame.append` to concatenate (stack) and multiple Series to + ``Series.append`` too + +- :ref:`Can` pass list of dicts (e.g., a + list of JSON objects) to DataFrame constructor (GH526_) + +- You can now :ref:`set multiple columns ` in a + DataFrame via ``__getitem__``, useful for transformation (GH342_) + +- Handle differently-indexed output values in ``DataFrame.apply`` (GH498_) + +.. ipython:: python + + df = DataFrame(randn(10, 4)) + df.apply(lambda x: x.describe()) + +- :ref:`Add` ``reorder_levels`` method to Series and + DataFrame (PR534_) + +- :ref:`Add` dict-like ``get`` function to DataFrame + and Panel (PR521_) + +- :ref:`Add` ``DataFrame.iterrows`` method for efficiently + iterating through the rows of a DataFrame + +- :ref:`Add` ``DataFrame.to_panel`` with code adapted from + ``LongPanel.to_long`` + +- :ref:`Add ` ``reindex_axis`` method added to DataFrame + +- :ref:`Add ` ``level`` option to binary arithmetic functions on + ``DataFrame`` and ``Series`` + +- :ref:`Add ` ``level`` option to the ``reindex`` + and ``align`` methods on Series and DataFrame for broadcasting values across + a level (GH542_, PR552_, others) + +- :ref:`Add ` attribute-based item access to + ``Panel`` and add IPython completion (PR563_) + +- :ref:`Add ` ``logy`` option to ``Series.plot`` for + log-scaling on the Y axis + +- :ref:`Add ` ``index`` and ``header`` options to + ``DataFrame.to_string`` + +- :ref:`Can ` pass multiple DataFrames to + ``DataFrame.join`` to join on index (GH115_) + +- :ref:`Can ` pass multiple Panels to ``Panel.join`` + (GH115_) + +- :ref:`Added ` ``justify`` argument to ``DataFrame.to_string`` + to allow different alignment of column headers + +- :ref:`Add ` ``sort`` option to GroupBy to allow disabling + sorting of the group keys for potential speedups (GH595_) + +- :ref:`Can ` pass MaskedArray to Series + constructor (PR563_) + +- :ref:`Add ` Panel item access via attributes + and IPython completion (GH554_) + +- Implement ``DataFrame.lookup``, fancy-indexing analogue for retrieving values + given a sequence of row and column labels (GH338_) + +- Can pass a :ref:`list of functions ` to + aggregate with groupby on a DataFrame, yielding an aggregated result with + hierarchical columns (GH166_) + +- Can call ``cummin`` and ``cummax`` on Series and DataFrame to get cumulative + minimum and maximum, respectively (GH647_) + +- ``value_range`` added as utility function to get min and max of a dataframe + (GH288_) + +- Added ``encoding`` argument to ``read_csv``, ``read_table``, ``to_csv`` and + ``from_csv`` for non-ascii text (GH717_) + +- :ref:`Added ` ``abs`` method to pandas objects + +- :ref:`Added ` ``crosstab`` function for easily computing frequency tables + +- :ref:`Added ` ``isin`` method to index objects + +- :ref:`Added ` ``level`` argument to ``xs`` method of DataFrame. 
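+
+As a quick illustration of two of the items above (the list-of-functions
+aggregation with groupby and ``DataFrame.lookup``), here is a minimal sketch;
+the small frame used below is hypothetical and not part of the original
+examples:
+
+.. code-block:: python
+
+    import numpy as np
+    from pandas import DataFrame
+
+    df = DataFrame({'A': ['x', 'x', 'y', 'y'],
+                    'C': [1.0, 2.0, 3.0, 4.0],
+                    'D': [10.0, 20.0, 30.0, 40.0]})
+
+    # passing a list of functions yields a result with hierarchical columns
+    df.groupby('A').agg([np.mean, np.std])
+
+    # fancy-indexing analogue: one value per (row label, column label) pair
+    df.lookup([0, 1, 2, 3], ['C', 'D', 'C', 'D'])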
+ + +API Changes to integer indexing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One of the potentially riskiest API changes in 0.7.0, but also one of the most +important, was a complete review of how **integer indexes** are handled with +regard to label-based indexing. Here is an example: + +.. ipython:: python + + s = Series(randn(10), index=range(0, 20, 2)) + s + s[0] + s[2] + s[4] + +This is all exactly identical to the behavior before. However, if you ask for a +key **not** contained in the Series, in versions 0.6.1 and prior, Series would +*fall back* on a location-based lookup. This now raises a ``KeyError``: + +.. code-block:: ipython + + In [2]: s[1] + KeyError: 1 + +This change also has the same impact on DataFrame: + +.. code-block:: ipython + + In [3]: df = DataFrame(randn(8, 4), index=range(0, 16, 2)) + + In [4]: df + 0 1 2 3 + 0 0.88427 0.3363 -0.1787 0.03162 + 2 0.14451 -0.1415 0.2504 0.58374 + 4 -1.44779 -0.9186 -1.4996 0.27163 + 6 -0.26598 -2.4184 -0.2658 0.11503 + 8 -0.58776 0.3144 -0.8566 0.61941 + 10 0.10940 -0.7175 -1.0108 0.47990 + 12 -1.16919 -0.3087 -0.6049 -0.43544 + 14 -0.07337 0.3410 0.0424 -0.16037 + + In [5]: df.ix[3] + KeyError: 3 + +In order to support purely integer-based indexing, the following methods have +been added: + +.. csv-table:: + :header: "Method","Description" + :widths: 40,60 + + ``Series.iget_value(i)``, Retrieve value stored at location ``i`` + ``Series.iget(i)``, Alias for ``iget_value`` + ``DataFrame.irow(i)``, Retrieve the ``i``-th row + ``DataFrame.icol(j)``, Retrieve the ``j``-th column + "``DataFrame.iget_value(i, j)``", Retrieve the value at row ``i`` and column ``j`` + +API tweaks regarding label-based slicing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Label-based slicing using ``ix`` now requires that the index be sorted +(monotonic) **unless** both the start and endpoint are contained in the index: + +.. ipython:: python + + s = Series(randn(6), index=list('gmkaec')) + s + +Then this is OK: + +.. ipython:: python + + s.ix['k':'e'] + +But this is not: + +.. code-block:: ipython + + In [12]: s.ix['b':'h'] + KeyError 'b' + +If the index had been sorted, the "range selection" would have been possible: + +.. ipython:: python + + s2 = s.sort_index() + s2 + s2.ix['b':'h'] + +Changes to Series ``[]`` operator +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As as notational convenience, you can pass a sequence of labels or a label +slice to a Series when getting and setting values via ``[]`` (i.e. the +``__getitem__`` and ``__setitem__`` methods). The behavior will be the same as +passing similar input to ``ix`` **except in the case of integer indexing**: + +.. ipython:: python + + s = Series(randn(6), index=list('acegkm')) + s + s[['m', 'a', 'c', 'e']] + s['b':'l'] + s['c':'k'] + +In the case of integer indexes, the behavior will be exactly as before +(shadowing ``ndarray``): + +.. ipython:: python + + s = Series(randn(6), index=range(0, 12, 2)) + s[[4, 0, 2]] + s[1:5] + +If you wish to do indexing with sequences and slicing on an integer index with +label semantics, use ``ix``. + +Other API Changes +~~~~~~~~~~~~~~~~~ + +- The deprecated ``LongPanel`` class has been completely removed + +- If ``Series.sort`` is called on a column of a DataFrame, an exception will + now be raised. 
Before it was possible to accidentally mutate a DataFrame's + column by doing ``df[col].sort()`` instead of the side-effect free method + ``df[col].order()`` (GH316_) + +- Miscellaneous renames and deprecations which will (harmlessly) raise + ``FutureWarning`` + +- ``drop`` added as an optional parameter to ``DataFrame.reset_index`` (GH699_) + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- :ref:`Cythonized GroupBy aggregations ` no longer + presort the data, thus achieving a significant speedup (GH93_). GroupBy + aggregations with Python functions significantly sped up by clever + manipulation of the ndarray data type in Cython (GH496_). +- Better error message in DataFrame constructor when passed column labels + don't match data (GH497_) +- Substantially improve performance of multi-GroupBy aggregation when a + Python function is passed, reuse ndarray object in Cython (GH496_) +- Can store objects indexed by tuples and floats in HDFStore (GH492_) +- Don't print length by default in Series.to_string, add `length` option (GH489_) +- Improve Cython code for multi-groupby to aggregate without having to sort + the data (GH93_) +- Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, + test for backwards unpickling compatibility +- Improve column reindexing performance by using specialized Cython take + function +- Further performance tweaking of Series.__getitem__ for standard use cases +- Avoid Index dict creation in some cases (i.e. when getting slices, etc.), + regression from prior versions +- Friendlier error message in setup.py if NumPy not installed +- Use common set of NA-handling operations (sum, mean, etc.) in Panel class + also (GH536_) +- Default name assignment when calling ``reset_index`` on DataFrame with a + regular (non-hierarchical) index (GH476_) +- Use Cythonized groupers when possible in Series/DataFrame stat ops with + ``level`` parameter passed (GH545_) +- Ported skiplist data structure to C to speed up ``rolling_median`` by about + 5-10x in most typical use cases (GH374_) + +.. _GH115: https://github.com/pydata/pandas/issues/115 +.. _GH166: https://github.com/pydata/pandas/issues/166 +.. _GH220: https://github.com/pydata/pandas/issues/220 +.. _GH288: https://github.com/pydata/pandas/issues/288 +.. _GH249: https://github.com/pydata/pandas/issues/249 +.. _GH267: https://github.com/pydata/pandas/issues/267 +.. _GH273: https://github.com/pydata/pandas/issues/273 +.. _GH316: https://github.com/pydata/pandas/issues/316 +.. _GH338: https://github.com/pydata/pandas/issues/338 +.. _GH342: https://github.com/pydata/pandas/issues/342 +.. _GH374: https://github.com/pydata/pandas/issues/374 +.. _GH439: https://github.com/pydata/pandas/issues/439 +.. _GH468: https://github.com/pydata/pandas/issues/468 +.. _GH476: https://github.com/pydata/pandas/issues/476 +.. _GH479: https://github.com/pydata/pandas/issues/479 +.. _GH489: https://github.com/pydata/pandas/issues/489 +.. _GH492: https://github.com/pydata/pandas/issues/492 +.. _GH496: https://github.com/pydata/pandas/issues/496 +.. _GH497: https://github.com/pydata/pandas/issues/497 +.. _GH498: https://github.com/pydata/pandas/issues/498 +.. _GH526: https://github.com/pydata/pandas/issues/526 +.. _GH536: https://github.com/pydata/pandas/issues/536 +.. _GH542: https://github.com/pydata/pandas/issues/542 +.. _GH545: https://github.com/pydata/pandas/issues/545 +.. _GH554: https://github.com/pydata/pandas/issues/554 +.. _GH595: https://github.com/pydata/pandas/issues/595 +.. 
_GH647: https://github.com/pydata/pandas/issues/647 +.. _GH699: https://github.com/pydata/pandas/issues/699 +.. _GH717: https://github.com/pydata/pandas/issues/717 +.. _GH93: https://github.com/pydata/pandas/issues/93 +.. _GH93: https://github.com/pydata/pandas/issues/93 +.. _PR521: https://github.com/pydata/pandas/pull/521 +.. _PR534: https://github.com/pydata/pandas/pull/534 +.. _PR552: https://github.com/pydata/pandas/pull/552 +.. _PR554: https://github.com/pydata/pandas/pull/554 +.. _PR563: https://github.com/pydata/pandas/pull/563 diff --git a/doc/source/v0.7.1.txt b/doc/source/v0.7.1.txt new file mode 100644 index 00000000..181751eb --- /dev/null +++ b/doc/source/v0.7.1.txt @@ -0,0 +1,39 @@ +.. _whatsnew_0701: + +v.0.7.1 (February 29, 2012) +--------------------------- + +This release includes a few new features and addresses over a dozen bugs in +0.7.0. + +New features +~~~~~~~~~~~~ + + - Add ``to_clipboard`` function to pandas namespace for writing objects to + the system clipboard (GH774_) + - Add ``itertuples`` method to DataFrame for iterating through the rows of a + dataframe as tuples (GH818_) + - Add ability to pass fill_value and method to DataFrame and Series align + method (GH806_, GH807_) + - Add fill_value option to reindex, align methods (GH784_) + - Enable concat to produce DataFrame from Series (GH787_) + - Add ``between`` method to Series (GH802_) + - Add HTML representation hook to DataFrame for the IPython HTML notebook + (GH773_) + - Support for reading Excel 2007 XML documents using openpyxl + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + - Improve performance and memory usage of fillna on DataFrame + - Can concatenate a list of Series along axis=1 to obtain a DataFrame (GH787_) + +.. _GH774: https://github.com/pydata/pandas/issues/774 +.. _GH818: https://github.com/pydata/pandas/issues/818 +.. _GH806: https://github.com/pydata/pandas/issues/806 +.. _GH807: https://github.com/pydata/pandas/issues/807 +.. _GH784: https://github.com/pydata/pandas/issues/784 +.. _GH787: https://github.com/pydata/pandas/issues/787 +.. _GH802: https://github.com/pydata/pandas/issues/802 +.. _GH773: https://github.com/pydata/pandas/issues/773 +.. _GH787: https://github.com/pydata/pandas/issues/787 \ No newline at end of file diff --git a/doc/source/v0.7.2.txt b/doc/source/v0.7.2.txt new file mode 100644 index 00000000..04f7686e --- /dev/null +++ b/doc/source/v0.7.2.txt @@ -0,0 +1,38 @@ +.. _whatsnew_0702: + +v.0.7.2 (March 16, 2012) +--------------------------- + +This release targets bugs in 0.7.1, and adds a few minor features. + +New features +~~~~~~~~~~~~ + + - Add additional tie-breaking methods in DataFrame.rank (GH874_) + - Add ascending parameter to rank in Series, DataFrame (GH875_) + - Add coerce_float option to DataFrame.from_records (GH893_) + - Add sort_columns parameter to allow unsorted plots (GH918_) + - Enable column access via attributes on GroupBy (GH882_) + - Can pass dict of values to DataFrame.fillna (GH661_) + - Can select multiple hierarchical groups by passing list of values in .ix + (GH134_) + - Add ``axis`` option to DataFrame.fillna (GH174_) + - Add level keyword to ``drop`` for dropping values from a level (GH159_) + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + + - Use khash for Series.value_counts, add raw function to algorithms.py (GH861_) + - Intercept __builtin__.sum in groupby (GH885_) + +.. _GH134: https://github.com/pydata/pandas/issues/134 +.. _GH159: https://github.com/pydata/pandas/issues/159 +.. 
_GH174: https://github.com/pydata/pandas/issues/174 +.. _GH661: https://github.com/pydata/pandas/issues/661 +.. _GH874: https://github.com/pydata/pandas/issues/874 +.. _GH875: https://github.com/pydata/pandas/issues/875 +.. _GH893: https://github.com/pydata/pandas/issues/893 +.. _GH918: https://github.com/pydata/pandas/issues/918 +.. _GH882: https://github.com/pydata/pandas/issues/882 +.. _GH861: https://github.com/pydata/pandas/issues/861 +.. _GH885: https://github.com/pydata/pandas/issues/885 diff --git a/doc/source/v0.7.3.txt b/doc/source/v0.7.3.txt new file mode 100644 index 00000000..2bcfeae9 --- /dev/null +++ b/doc/source/v0.7.3.txt @@ -0,0 +1,96 @@ +.. _whatsnew_0703: + +v.0.7.3 (April 12, 2012) +------------------------ + +This is a minor release from 0.7.2 and fixes many minor bugs and adds a number +of nice new features. There are also a couple of API changes to note; these +should not affect very many users, and we are inclined to call them "bug fixes" +even though they do constitute a change in behavior. See the `full release +notes `__ or issue +tracker on GitHub for a complete list. + +New features +~~~~~~~~~~~~ + +- New :ref:`fixed width file reader `, ``read_fwf`` +- New :ref:`scatter_matrix ` function for making + a scatter plot matrix + +.. code-block:: python + + from pandas.tools.plotting import scatter_matrix + scatter_matrix(df, alpha=0.2) + +.. image:: _static/scatter_matrix_ex.png + :width: 5in + +- Add ``stacked`` argument to Series and DataFrame's ``plot`` method for + :ref:`stacked bar plots `. + +.. code-block:: python + + df.plot(kind='bar', stacked=True) + +.. image:: _static/bar_plot_stacked_ex.png + :width: 4in + +.. code-block:: python + + df.plot(kind='barh', stacked=True) + +.. image:: _static/barh_plot_stacked_ex.png + :width: 4in + +- Add log x and y :ref:`scaling options ` to + ``DataFrame.plot`` and ``Series.plot`` +- Add ``kurt`` methods to Series and DataFrame for computing kurtosis + + +NA Boolean Comparison API Change +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Reverted some changes to how NA values (represented typically as ``NaN`` or +``None``) are handled in non-numeric Series: + +.. ipython:: python + + series = Series(['Steve', np.nan, 'Joe']) + series == 'Steve' + series != 'Steve' + +In comparisons, NA / NaN will always come through as ``False`` except with +``!=`` which is ``True``. *Be very careful* with boolean arithmetic, especially +negation, in the presence of NA data. You may wish to add an explicit NA +filter into boolean array operations if you are worried about this: + +.. ipython:: python + + mask = series == 'Steve' + series[mask & series.notnull()] + +While propagating NA in comparisons may seem like the right behavior to some +users (and you could argue on purely technical grounds that this is the right +thing to do), the evaluation was made that propagating NA everywhere, including +in numerical arrays, would cause a large amount of problems for users. Thus, a +"practicality beats purity" approach was taken. This issue may be revisited at +some point in the future. + +Other API Changes +~~~~~~~~~~~~~~~~~ + +When calling ``apply`` on a grouped Series, the return value will also be a +Series, to be more consistent with the ``groupby`` behavior with DataFrame: + +.. 
ipython:: python
+
+   df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+                          'foo', 'bar', 'foo', 'foo'],
+                   'B' : ['one', 'one', 'two', 'three',
+                          'two', 'two', 'one', 'three'],
+                   'C' : np.random.randn(8), 'D' : np.random.randn(8)})
+   df
+   grouped = df.groupby('A')['C']
+   grouped.describe()
+   grouped.apply(lambda x: x.order()[-2:]) # top 2 values
+
diff --git a/doc/source/v0.8.0.txt b/doc/source/v0.8.0.txt
new file mode 100644
index 00000000..9df24810
--- /dev/null
+++ b/doc/source/v0.8.0.txt
@@ -0,0 +1,274 @@
+.. _whatsnew_080:
+
+v0.8.0 (TBD June, 2012)
+------------------------
+
+This is a major release from 0.7.3 and includes extensive work on the time
+series handling and processing infrastructure as well as a great deal of new
+functionality throughout the library. It includes over 700 commits from more
+than 20 distinct authors. Most pandas 0.7.3 and earlier users should not
+experience any issues upgrading, but due to the migration to the NumPy
+datetime64 dtype, there may be a number of bugs and incompatibilities
+lurking. Lingering incompatibilities will be fixed ASAP in a 0.8.1 release if
+necessary. See the `full release notes
+`__ or issue tracker
+on GitHub for a complete list.
+
+Support for non-unique indexes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All objects can now work with non-unique indexes. Data alignment / join
+operations work according to SQL join semantics (including, if applicable,
+index duplication in many-to-many joins).
+
+NumPy datetime64 dtype and 1.6 dependency
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Time series data are now represented using NumPy's datetime64 dtype; thus,
+pandas 0.8.0 now requires at least NumPy 1.6. It has been tested and verified
+to work with the development version (1.7+) of NumPy as well, which includes
+some significant user-facing API changes. NumPy 1.6 also has a number of bugs
+having to do with nanosecond resolution data, so I recommend that you steer
+clear of NumPy 1.6's datetime64 API functions (though limited as they are) and
+only interact with this data using the interface that pandas provides.
+
+See the end of the 0.8.0 section for a "porting" guide listing potential issues
+for users migrating legacy codebases from pandas 0.7 or earlier to 0.8.0.
+
+Bug fixes to the 0.7.x series for legacy NumPy < 1.6 users will be provided as
+they arise. There will be no further development in 0.7.x beyond bug fixes.
+
+Time series changes and improvements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   With this release, legacy scikits.timeseries users should be able to port
+   their code to use pandas.
+
+.. note::
+
+   See :ref:`documentation ` for an overview of the pandas timeseries API.
+
+- New datetime64 representation **speeds up join operations and data
+  alignment**, **reduces memory usage**, and improves serialization /
+  deserialization performance significantly over datetime.datetime
+- High performance and flexible **resample** method for converting from
+  high-to-low and low-to-high frequency. Supports interpolation, user-defined
+  aggregation functions, and control over how the intervals and result labeling
+  are defined. A suite of high performance Cython/C-based resampling functions
+  (including Open-High-Low-Close) has also been implemented.
+- Revamp of :ref:`frequency aliases ` and support for
+  **frequency shortcuts** like '15min' or '1h30min'
+- New :ref:`DatetimeIndex class ` supports both fixed
+  frequency and irregular time
+  series.
Replaces the now deprecated DateRange class
+- New ``PeriodIndex`` and ``Period`` classes for representing
+  :ref:`time spans ` and performing **calendar logic**,
+  including the `12 fiscal quarterly frequencies `.
+  This is a partial port of, and a substantial enhancement to,
+  elements of the scikits.timeseries codebase. Support for conversion between
+  PeriodIndex and DatetimeIndex
+- New Timestamp data type subclasses `datetime.datetime`, providing the same
+  interface while enabling working with nanosecond-resolution data. Also
+  provides :ref:`easy time zone conversions `.
+- Enhanced support for :ref:`time zones `. Add
+  ``tz_convert`` and ``tz_localize`` methods to TimeSeries and DataFrame. All
+  timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time
+  zone set will be localized to localtime. Time zone conversions are therefore
+  essentially free. Users need to know very little about the pytz library now;
+  only time zone names as strings are required. Time zone-aware timestamps are
+  equal if and only if their UTC timestamps match. Operations between time
+  zone-aware time series with different time zones will result in a UTC-indexed
+  time series.
+- Time series **string indexing conveniences** / shortcuts: slice years, year
+  and month, and index values with strings
+- Enhanced time series **plotting**; adaptation of scikits.timeseries
+  matplotlib-based plotting code
+- New ``date_range``, ``bdate_range``, and ``period_range`` :ref:`factory
+  functions `
+- Robust **frequency inference** function `infer_freq` and ``inferred_freq``
+  property of DatetimeIndex, with option to infer frequency on construction of
+  DatetimeIndex
+- to_datetime function efficiently **parses array of strings** to
+  DatetimeIndex. DatetimeIndex will parse array or list of strings to
+  datetime64
+- **Optimized** support for datetime64-dtype data in Series and DataFrame
+  columns
+- New NaT (Not-a-Time) type to represent **NA** in timestamp arrays
+- Optimize Series.asof for looking up **"as of" values** for arrays of
+  timestamps
+- Milli, Micro, Nano date offset objects
+- Can index time series with datetime.time objects to select all data at
+  particular **time of day** (``TimeSeries.at_time``) or **between two times**
+  (``TimeSeries.between_time``)
+- Add :ref:`tshift ` method for leading/lagging
+  using the frequency (if any) of the index, as opposed to a naive lead/lag
+  using shift
+
+Other new features
+~~~~~~~~~~~~~~~~~~
+
+- New :ref:`cut ` and ``qcut`` functions (like R's cut
+  function) for computing a categorical variable from a continuous variable by
+  binning values either into value-based (``cut``) or quantile-based (``qcut``)
+  bins
+- Rename ``Factor`` to ``Categorical`` and add a number of usability features
+- Add :ref:`limit ` argument to fillna/reindex
+- More flexible multiple function application in GroupBy, and can pass list
+  (name, function) tuples to get result in particular order with given names
+- Add flexible :ref:`replace ` method for efficiently
+  substituting values
+- Enhanced :ref:`read_csv/read_table ` for reading time series
+  data and converting multiple columns to dates
+- Add :ref:`comments ` option to parser functions: read_csv, etc.
+- Add :ref:`dayfirst ` option to parser functions for parsing
+  international DD/MM/YYYY dates
+- Allow the user to specify the CSV reader :ref:`dialect ` to
+  control quoting etc.
+- Handling :ref:`thousands ` separators in read_csv to improve
+  integer parsing.
+- Enable unstacking of multiple levels in one shot.
Alleviate ``pivot_table`` + bugs (empty columns being introduced) +- Move to klib-based hash tables for indexing; better performance and less + memory usage than Python's dict +- Add first, last, min, max, and prod optimized GroupBy functions +- New :ref:`ordered_merge ` function +- Add flexible :ref:`comparison ` instance methods eq, ne, lt, + gt, etc. to DataFrame, Series +- Improve :ref:`scatter_matrix ` plotting + function and add histogram or kernel density estimates to diagonal +- Add :ref:`'kde' ` plot option for density plots +- Support for converting DataFrame to R data.frame through rpy2 +- Improved support for complex numbers in Series and DataFrame +- Add :ref:`pct_change ` method to all data structures +- Add max_colwidth configuration option for DataFrame console output +- :ref:`Interpolate ` Series values using index values +- Can select multiple columns from GroupBy +- Add :ref:`update ` methods to Series/DataFrame + for updating values in place +- Add ``any`` and ``all method to DataFrame + +New plotting methods +~~~~~~~~~~~~~~~~~~~~ + +.. ipython:: python + :suppress: + + import pandas as pd + fx = pd.load('data/fx_prices') + import matplotlib.pyplot as plt + +``Series.plot`` now supports a ``secondary_y`` option: + +.. ipython:: python + + plt.figure() + + fx['FR'].plot(style='g') + + @savefig whatsnew_secondary_y.png width=4.5in + fx['IT'].plot(style='k--', secondary_y=True) + +Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot +types. For example, ``'kde'`` is a new option: + +.. ipython:: python + + s = Series(np.concatenate((np.random.randn(1000), + np.random.randn(1000) * 0.5 + 3))) + plt.figure() + s.hist(normed=True, alpha=0.2) + @savefig whatsnew_kde.png width=4.5in + s.plot(kind='kde') + +See :ref:`the plotting page ` for much more. + +Other API changes +~~~~~~~~~~~~~~~~~ + +- Deprecation of ``offset``, ``time_rule``, and ``timeRule`` arguments names in + time series functions. Warnings will be printed until pandas 0.9 or 1.0. + +Potential porting issues for pandas <= 0.7.3 users +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The major change that may affect you in pandas 0.8.0 is that time series +indexes use NumPy's ``datetime64`` data type instead of ``dtype=object`` arrays +of Python's built-in ``datetime.datetime`` objects. ``DateRange`` has been +replaced by ``DatetimeIndex`` but otherwise behaved identically. But, if you +have code that converts ``DateRange`` or ``Index`` objects that used to contain +``datetime.datetime`` values to plain NumPy arrays, you may have bugs lurking +with code using scalar values because you are handing control over to NumPy: + +.. ipython:: python + + import datetime + rng = date_range('1/1/2000', periods=10) + rng[5] + isinstance(rng[5], datetime.datetime) + rng_asarray = np.asarray(rng) + scalar_val = rng_asarray[5] + type(scalar_val) + +pandas's ``Timestamp`` object is a subclass of ``datetime.datetime`` that has +nanosecond support (the ``nanosecond`` field store the nanosecond value between +0 and 999). It should substitute directly into any code that used +``datetime.datetime`` values before. Thus, I recommend not casting +``DatetimeIndex`` to regular NumPy arrays. + +If you have code that requires an array of ``datetime.datetime`` objects, you +have a couple of options. First, the ``asobject`` property of ``DatetimeIndex`` +produces an array of ``Timestamp`` objects: + +.. 
ipython:: python + + stamp_array = rng.asobject + stamp_array + stamp_array[5] + +To get an array of proper ``datetime.datetime`` objects, use the +``to_pydatetime`` method: + +.. ipython:: python + + dt_array = rng.to_pydatetime() + dt_array + dt_array[5] + +matplotlib knows how to handle ``datetime.datetime`` but not Timestamp +objects. While I recommend that you plot time series using ``TimeSeries.plot``, +you can either use ``to_pydatetime`` or register a converter for the Timestamp +type. See `matplotlib documentation +`__ for more on this. + +.. warning:: + + There are bugs in the user-facing API with the nanosecond datetime64 unit + in NumPy 1.6. In particular, the string version of the array shows garbage + values, and conversion to ``dtype=object`` is similarly broken. + + .. ipython:: python + + rng = date_range('1/1/2000', periods=10) + rng + np.asarray(rng) + converted = np.asarray(rng, dtype=object) + converted[5] + + **Trust me: don't panic**. If you are using NumPy 1.6 and restrict your + interaction with ``datetime64`` values to pandas's API you will be just + fine. There is nothing wrong with the data-type (a 64-bit integer + internally); all of the important data processing happens in pandas and is + heavily tested. I strongly recommend that you **do not work directly with + datetime64 arrays in NumPy 1.6** and only use the pandas API. + + +**Support for non-unique indexes**: In the latter case, you may have code +inside a ``try:... catch:`` block that failed due to the index not being +unique. In many cases it will no longer fail (some method like ``append`` still +check for uniqueness unless disabled). However, all is not lost: you can +inspect ``index.is_unique`` and raise an exception explicitly if it is +``False`` or go to a different code branch. + diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst new file mode 100644 index 00000000..eb15d7f7 --- /dev/null +++ b/doc/source/visualization.rst @@ -0,0 +1,343 @@ +.. currentmodule:: pandas +.. _visualization: + +.. ipython:: python + :suppress: + + import numpy as np + np.random.seed(123456) + from pandas import * + import pandas.util.testing as tm + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + import matplotlib.pyplot as plt + plt.close('all') + +************************ +Plotting with matplotlib +************************ + +.. note:: + + We intend to build more plotting integration with `matplotlib + `__ as time goes on. + +We use the standard convention for referencing the matplotlib API: + +.. ipython:: python + + import matplotlib.pyplot as plt + +.. _visualization.basic: + +Basic plotting: ``plot`` +------------------------ + +The ``plot`` method on Series and DataFrame is just a simple wrapper around +``plt.plot``: + +.. ipython:: python + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = ts.cumsum() + + @savefig series_plot_basic.png width=4.5in + ts.plot() + +If the index consists of dates, it calls ``gcf().autofmt_xdate()`` to try to +format the x-axis nicely as per above. The method takes a number of arguments +for controlling the look of the plot: + +.. ipython:: python + + @savefig series_plot_basic2.png width=4.5in + plt.figure(); ts.plot(style='k--', label='Series'); plt.legend() + +On DataFrame, ``plot`` is a convenience to plot all of the columns with labels: + +.. 
ipython:: python + + df = DataFrame(randn(1000, 4), index=ts.index, + columns=['A', 'B', 'C', 'D']) + df = df.cumsum() + + @savefig frame_plot_basic.png width=4.5in + plt.figure(); df.plot(); plt.legend(loc='best') + +You may set the ``legend`` argument to ``False`` to hide the legend, which is +shown by default. + +.. ipython:: python + + @savefig frame_plot_basic_noleg.png width=4.5in + df.plot(legend=False) + +Some other options are available, like plotting each Series on a different axis: + +.. ipython:: python + + @savefig frame_plot_subplots.png width=4.5in + df.plot(subplots=True, figsize=(8, 8)); plt.legend(loc='best') + +You may pass ``logy`` to get a log-scale Y axis. + +.. ipython:: python + + plt.figure(); + + ts = Series(randn(1000), index=date_range('1/1/2000', periods=1000)) + ts = np.exp(ts.cumsum()) + + @savefig series_plot_logy.png width=4.5in + ts.plot(logy=True) + +Plotting on a Secondary Y-axis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To plot data on a secondary y-axis, use the ``secondary_y`` keyword: + +.. ipython:: python + + plt.figure() + + df.A.plot() + + @savefig series_plot_secondary_y.png width=4.5in + df.B.plot(secondary_y=True, style='g') + + +Targeting different subplots +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can pass an ``ax`` argument to ``Series.plot`` to plot on a particular axis: + +.. ipython:: python + + fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 5)) + df['A'].plot(ax=axes[0,0]); axes[0,0].set_title('A') + df['B'].plot(ax=axes[0,1]); axes[0,1].set_title('B') + df['C'].plot(ax=axes[1,0]); axes[1,0].set_title('C') + + @savefig series_plot_multi.png width=4.5in + df['D'].plot(ax=axes[1,1]); axes[1,1].set_title('D') + +.. _visualization.other: + +Other plotting features +----------------------- + +.. _visualization.barplot: + +Bar plots +~~~~~~~~~ + +For labeled, non-time series data, you may wish to produce a bar plot: + +.. ipython:: python + + plt.figure(); + + @savefig bar_plot_ex.png width=4.5in + df.ix[5].plot(kind='bar'); plt.axhline(0, color='k') + +Calling a DataFrame's ``plot`` method with ``kind='bar'`` produces a multiple +bar plot: + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + df2 = DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + + @savefig bar_plot_multi_ex.png width=5in + df2.plot(kind='bar'); + +To produce a stacked bar plot, pass ``stacked=True``: + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + @savefig bar_plot_stacked_ex.png width=5in + df2.plot(kind='bar', stacked=True); + +To get horizontal bar plots, pass ``kind='barh'``: + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + @savefig barh_plot_stacked_ex.png width=5in + df2.plot(kind='barh', stacked=True); + +Histograms +~~~~~~~~~~ +.. ipython:: python + + plt.figure(); + + @savefig hist_plot_ex.png width=4.5in + df['A'].diff().hist() + +For a DataFrame, ``hist`` plots the histograms of the columns on multiple +subplots: + +.. ipython:: python + + plt.figure() + + @savefig frame_hist_ex.png width=4.5in + df.diff().hist(color='k', alpha=0.5, bins=50) + +.. _visualization.box: + +Box-Plotting +~~~~~~~~~~~~ + +DataFrame has a ``boxplot`` method which allows you to visualize the +distribution of values within each column. + +For instance, here is a boxplot representing five trials of 10 observations of +a uniform random variable on [0,1). + +.. 
ipython:: python + + df = DataFrame(np.random.rand(10,5)) + plt.figure(); + + @savefig box_plot_ex.png width=4.5in + bp = df.boxplot() + +You can create a stratified boxplot using the ``by`` keyword argument to create +groupings. For instance, + +.. ipython:: python + + df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + + plt.figure(); + + @savefig box_plot_ex2.png width=4.5in + bp = df.boxplot(by='X') + +You can also pass a subset of columns to plot, as well as group by multiple +columns: + +.. ipython:: python + + df = DataFrame(np.random.rand(10,3), columns=['Col1', 'Col2', 'Col3']) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + df['Y'] = Series(['A','B','A','B','A','B','A','B','A','B']) + + plt.figure(); + + @savefig box_plot_ex3.png width=4.5in + bp = df.boxplot(column=['Col1','Col2'], by=['X','Y']) + +.. _visualization.scatter_matrix: + +Scatter plot matrix +~~~~~~~~~~~~~~~~~~~ + +*New in 0.7.3.* You can create a scatter plot matrix using the + ``scatter_matrix`` method in ``pandas.tools.plotting``: + +.. ipython:: python + + from pandas.tools.plotting import scatter_matrix + df = DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd']) + + @savefig scatter_matrix_kde.png width=6in + scatter_matrix(df, alpha=0.2, figsize=(8, 8), diagonal='kde') + +.. _visualization.kde: + +*New in 0.8.0* You can create density plots using the Series/DataFrame.plot and +setting `kind='kde'`: + +.. ipython:: python + :suppress: + + plt.figure(); + +.. ipython:: python + + ser = Series(np.random.randn(1000)) + + @savefig kde_plot.png width=6in + ser.plot(kind='kde') + +.. _visualization.andrews_curves: + +Andrews Curves +~~~~~~~~~~~~~~ + +Andrews curves allow one to plot multivariate data as a large number +of curves that are created using the attributes of samples as coefficients +for Fourier series. By coloring these curves differently for each class +it is possible to visualize data clustering. Curves belonging to samples +of the same class will usually be closer together and form larger structures. + +.. ipython:: python + + from pandas import read_csv + from pandas.tools.plotting import andrews_curves + + data = read_csv('data/iris.data') + + plt.figure() + + @savefig andrews_curves.png width=6in + andrews_curves(data, 'Name') + +Lag Plot +~~~~~~~~ + +Lag plots are used to check if a data set or time series is random. Random +data should not exhibit any structure in the lag plot. Non-random structure +implies that the underlying data are not random. + +.. ipython:: python + + from pandas.tools.plotting import lag_plot + + plt.figure() + + data = Series(0.1 * np.random.random(1000) + + 0.9 * np.sin(np.linspace(-99 * np.pi, 99 * np.pi, num=1000))) + + @savefig lag_plot.png width=6in + lag_plot(data) + +Autocorrelation Plot +~~~~~~~~~~~~~~~~~~~~ + +Autocorrelation plots are often used for checking randomness in time series. +This is done by computing autocorrelations for data values at varying time lags. +If time series is random, such autocorrelations should be near zero for any and +all time-lag separations. If time series is non-random then one or more of the +autocorrelations will be significantly non-zero. The horizontal lines displayed +in the plot correspond to 95% and 99% confidence bands. The dashed line is 99% +confidence band. + +.. 
ipython:: python + + from pandas.tools.plotting import autocorrelation_plot + + plt.figure() + + data = Series(0.7 * np.random.random(1000) + + 0.3 * np.sin(np.linspace(-9 * np.pi, 9 * np.pi, num=1000))) + + @savefig autocorrelation_plot.png width=6in + autocorrelation_plot(data) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst new file mode 100644 index 00000000..e60baa37 --- /dev/null +++ b/doc/source/whatsnew.rst @@ -0,0 +1,36 @@ +.. _whatsnew: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + from pandas import * + randn = np.random.randn + np.set_printoptions(precision=4, suppress=True) + +********** +What's New +********** + +These are new features and improvements of note in each release. + +.. include:: v0.8.0.txt + +.. include:: v0.7.3.txt + +.. include:: v0.7.2.txt + +.. include:: v0.7.1.txt + +.. include:: v0.7.0.txt + +.. include:: v0.6.1.txt + +.. include:: v0.6.0.txt + +.. include:: v0.5.0.txt + +.. include:: v0.4.x.txt + diff --git a/doc/sphinxext/LICENSE.txt b/doc/sphinxext/LICENSE.txt new file mode 100755 index 00000000..e00efc31 --- /dev/null +++ b/doc/sphinxext/LICENSE.txt @@ -0,0 +1,97 @@ +------------------------------------------------------------------------------- + The files + - numpydoc.py + - autosummary.py + - autosummary_generate.py + - docscrape.py + - docscrape_sphinx.py + - phantom_import.py + have the following license: + +Copyright (C) 2008 Stefan van der Walt , Pauli Virtanen + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + +THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +------------------------------------------------------------------------------- + The files + - compiler_unparse.py + - comment_eater.py + - traitsdoc.py + have the following license: + +This software is OSI Certified Open Source Software. +OSI Certified is a certification mark of the Open Source Initiative. + +Copyright (c) 2006, Enthought, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Enthought, Inc. nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +------------------------------------------------------------------------------- + The files + - only_directives.py + - plot_directive.py + originate from Matplotlib (http://matplotlib.sf.net/) which has + the following license: + +Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved. + +1. This LICENSE AGREEMENT is between John D. Hunter (“JDH”), and the Individual or Organization (“Licensee”) accessing and otherwise using matplotlib software in source or binary form and its associated documentation. + +2. Subject to the terms and conditions of this License Agreement, JDH hereby grants Licensee a nonexclusive, royalty-free, world-wide license to reproduce, analyze, test, perform and/or display publicly, prepare derivative works, distribute, and otherwise use matplotlib 0.98.3 alone or in any derivative version, provided, however, that JDH’s License Agreement and JDH’s notice of copyright, i.e., “Copyright (c) 2002-2008 John D. Hunter; All Rights Reserved” are retained in matplotlib 0.98.3 alone or in any derivative version prepared by Licensee. + +3. In the event Licensee prepares a derivative work that is based on or incorporates matplotlib 0.98.3 or any part thereof, and wants to make the derivative work available to others as provided herein, then Licensee hereby agrees to include in any such work a brief summary of the changes made to matplotlib 0.98.3. + +4. JDH is making matplotlib 0.98.3 available to Licensee on an “AS IS” basis. JDH MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED. BY WAY OF EXAMPLE, BUT NOT LIMITATION, JDH MAKES NO AND DISCLAIMS ANY REPRESENTATION OR WARRANTY OF MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF MATPLOTLIB 0.98.3 WILL NOT INFRINGE ANY THIRD PARTY RIGHTS. + +5. JDH SHALL NOT BE LIABLE TO LICENSEE OR ANY OTHER USERS OF MATPLOTLIB 0.98.3 FOR ANY INCIDENTAL, SPECIAL, OR CONSEQUENTIAL DAMAGES OR LOSS AS A RESULT OF MODIFYING, DISTRIBUTING, OR OTHERWISE USING MATPLOTLIB 0.98.3, OR ANY DERIVATIVE THEREOF, EVEN IF ADVISED OF THE POSSIBILITY THEREOF. + +6. This License Agreement will automatically terminate upon a material breach of its terms and conditions. + +7. Nothing in this License Agreement shall be deemed to create any relationship of agency, partnership, or joint venture between JDH and Licensee. 
This License Agreement does not grant permission to use JDH trademarks or trade name in a trademark sense to endorse or promote products or services of Licensee, or any third party. + +8. By copying, installing or otherwise using matplotlib 0.98.3, Licensee agrees to be bound by the terms and conditions of this License Agreement. + diff --git a/doc/sphinxext/MANIFEST.in b/doc/sphinxext/MANIFEST.in new file mode 100755 index 00000000..f88ed785 --- /dev/null +++ b/doc/sphinxext/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include tests *.py +include *.txt diff --git a/doc/sphinxext/README.txt b/doc/sphinxext/README.txt new file mode 100755 index 00000000..f3d782c9 --- /dev/null +++ b/doc/sphinxext/README.txt @@ -0,0 +1,52 @@ +===================================== +numpydoc -- Numpy's Sphinx extensions +===================================== + +Numpy's documentation uses several custom extensions to Sphinx. These +are shipped in this ``numpydoc`` package, in case you want to make use +of them in third-party projects. + +The following extensions are available: + + - ``numpydoc``: support for the Numpy docstring format in Sphinx, and add + the code description directives ``np:function``, ``np-c:function``, etc. + that support the Numpy docstring syntax. + + - ``numpydoc.traitsdoc``: For gathering documentation about Traits attributes. + + - ``numpydoc.plot_directives``: Adaptation of Matplotlib's ``plot::`` + directive. Note that this implementation may still undergo severe + changes or eventually be deprecated. + + - ``numpydoc.only_directives``: (DEPRECATED) + + - ``numpydoc.autosummary``: (DEPRECATED) An ``autosummary::`` directive. + Available in Sphinx 0.6.2 and (to-be) 1.0 as ``sphinx.ext.autosummary``, + and it the Sphinx 1.0 version is recommended over that included in + Numpydoc. + + +numpydoc +======== + +Numpydoc inserts a hook into Sphinx's autodoc that converts docstrings +following the Numpy/Scipy format to a form palatable to Sphinx. + +Options +------- + +The following options can be set in conf.py: + +- numpydoc_use_plots: bool + + Whether to produce ``plot::`` directives for Examples sections that + contain ``import matplotlib``. + +- numpydoc_show_class_members: bool + + Whether to show all members of a class in the Methods and Attributes + sections automatically. + +- numpydoc_edit_link: bool (DEPRECATED -- edit your HTML template instead) + + Whether to insert an edit link after docstrings. diff --git a/doc/sphinxext/__init__.py b/doc/sphinxext/__init__.py new file mode 100755 index 00000000..ae9073bc --- /dev/null +++ b/doc/sphinxext/__init__.py @@ -0,0 +1 @@ +from numpydoc import setup diff --git a/doc/sphinxext/comment_eater.py b/doc/sphinxext/comment_eater.py new file mode 100755 index 00000000..e11eea90 --- /dev/null +++ b/doc/sphinxext/comment_eater.py @@ -0,0 +1,158 @@ +from cStringIO import StringIO +import compiler +import inspect +import textwrap +import tokenize + +from compiler_unparse import unparse + + +class Comment(object): + """ A comment block. + """ + is_comment = True + def __init__(self, start_lineno, end_lineno, text): + # int : The first line number in the block. 1-indexed. + self.start_lineno = start_lineno + # int : The last line number. Inclusive! + self.end_lineno = end_lineno + # str : The text block including '#' character but not any leading spaces. + self.text = text + + def add(self, string, start, end, line): + """ Add a new comment line. 
+ """ + self.start_lineno = min(self.start_lineno, start[0]) + self.end_lineno = max(self.end_lineno, end[0]) + self.text += string + + def __repr__(self): + return '%s(%r, %r, %r)' % (self.__class__.__name__, self.start_lineno, + self.end_lineno, self.text) + + +class NonComment(object): + """ A non-comment block of code. + """ + is_comment = False + def __init__(self, start_lineno, end_lineno): + self.start_lineno = start_lineno + self.end_lineno = end_lineno + + def add(self, string, start, end, line): + """ Add lines to the block. + """ + if string.strip(): + # Only add if not entirely whitespace. + self.start_lineno = min(self.start_lineno, start[0]) + self.end_lineno = max(self.end_lineno, end[0]) + + def __repr__(self): + return '%s(%r, %r)' % (self.__class__.__name__, self.start_lineno, + self.end_lineno) + + +class CommentBlocker(object): + """ Pull out contiguous comment blocks. + """ + def __init__(self): + # Start with a dummy. + self.current_block = NonComment(0, 0) + + # All of the blocks seen so far. + self.blocks = [] + + # The index mapping lines of code to their associated comment blocks. + self.index = {} + + def process_file(self, file): + """ Process a file object. + """ + for token in tokenize.generate_tokens(file.next): + self.process_token(*token) + self.make_index() + + def process_token(self, kind, string, start, end, line): + """ Process a single token. + """ + if self.current_block.is_comment: + if kind == tokenize.COMMENT: + self.current_block.add(string, start, end, line) + else: + self.new_noncomment(start[0], end[0]) + else: + if kind == tokenize.COMMENT: + self.new_comment(string, start, end, line) + else: + self.current_block.add(string, start, end, line) + + def new_noncomment(self, start_lineno, end_lineno): + """ We are transitioning from a noncomment to a comment. + """ + block = NonComment(start_lineno, end_lineno) + self.blocks.append(block) + self.current_block = block + + def new_comment(self, string, start, end, line): + """ Possibly add a new comment. + + Only adds a new comment if this comment is the only thing on the line. + Otherwise, it extends the noncomment block. + """ + prefix = line[:start[1]] + if prefix.strip(): + # Oops! Trailing comment, not a comment block. + self.current_block.add(string, start, end, line) + else: + # A comment block. + block = Comment(start[0], end[0], string) + self.blocks.append(block) + self.current_block = block + + def make_index(self): + """ Make the index mapping lines of actual code to their associated + prefix comments. + """ + for prev, block in zip(self.blocks[:-1], self.blocks[1:]): + if not block.is_comment: + self.index[block.start_lineno] = prev + + def search_for_comment(self, lineno, default=None): + """ Find the comment block just before the given line number. + + Returns None (or the specified default) if there is no such block. + """ + if not self.index: + self.make_index() + block = self.index.get(lineno, None) + text = getattr(block, 'text', default) + return text + + +def strip_comment_marker(text): + """ Strip # markers at the front of a block of comment text. + """ + lines = [] + for line in text.splitlines(): + lines.append(line.lstrip('#')) + text = textwrap.dedent('\n'.join(lines)) + return text + + +def get_class_traits(klass): + """ Yield all of the documentation for trait definitions on a class object. + """ + # FIXME: gracefully handle errors here or in the caller? 
+ source = inspect.getsource(klass) + cb = CommentBlocker() + cb.process_file(StringIO(source)) + mod_ast = compiler.parse(source) + class_ast = mod_ast.node.nodes[0] + for node in class_ast.code.nodes: + # FIXME: handle other kinds of assignments? + if isinstance(node, compiler.ast.Assign): + name = node.nodes[0].name + rhs = unparse(node.expr).strip() + doc = strip_comment_marker(cb.search_for_comment(node.lineno, default='')) + yield name, rhs, doc + diff --git a/doc/sphinxext/compiler_unparse.py b/doc/sphinxext/compiler_unparse.py new file mode 100755 index 00000000..ffcf51b3 --- /dev/null +++ b/doc/sphinxext/compiler_unparse.py @@ -0,0 +1,860 @@ +""" Turn compiler.ast structures back into executable python code. + + The unparse method takes a compiler.ast tree and transforms it back into + valid python code. It is incomplete and currently only works for + import statements, function calls, function definitions, assignments, and + basic expressions. + + Inspired by python-2.5-svn/Demo/parser/unparse.py + + fixme: We may want to move to using _ast trees because the compiler for + them is about 6 times faster than compiler.compile. +""" + +import sys +import cStringIO +from compiler.ast import Const, Name, Tuple, Div, Mul, Sub, Add + +def unparse(ast, single_line_functions=False): + s = cStringIO.StringIO() + UnparseCompilerAst(ast, s, single_line_functions) + return s.getvalue().lstrip() + +op_precedence = { 'compiler.ast.Power':3, 'compiler.ast.Mul':2, 'compiler.ast.Div':2, + 'compiler.ast.Add':1, 'compiler.ast.Sub':1 } + +class UnparseCompilerAst: + """ Methods in this class recursively traverse an AST and + output source code for the abstract syntax; original formatting + is disregarged. + """ + + ######################################################################### + # object interface. + ######################################################################### + + def __init__(self, tree, file = sys.stdout, single_line_functions=False): + """ Unparser(tree, file=sys.stdout) -> None. + + Print the source for tree to file. + """ + self.f = file + self._single_func = single_line_functions + self._do_indent = True + self._indent = 0 + self._dispatch(tree) + self._write("\n") + self.f.flush() + + ######################################################################### + # Unparser private interface. + ######################################################################### + + ### format, output, and dispatch methods ################################ + + def _fill(self, text = ""): + "Indent a piece of text, according to the current indentation level" + if self._do_indent: + self._write("\n"+" "*self._indent + text) + else: + self._write(text) + + def _write(self, text): + "Append a piece of text to the current line." + self.f.write(text) + + def _enter(self): + "Print ':', and increase the indentation." + self._write(": ") + self._indent += 1 + + def _leave(self): + "Decrease the indentation level." + self._indent -= 1 + + def _dispatch(self, tree): + "_dispatcher function, _dispatching tree type T to method _T." + if isinstance(tree, list): + for t in tree: + self._dispatch(t) + return + meth = getattr(self, "_"+tree.__class__.__name__) + if tree.__class__.__name__ == 'NoneType' and not self._do_indent: + return + meth(tree) + + + ######################################################################### + # compiler.ast unparsing methods. + # + # There should be one method per concrete grammar type. They are + # organized in alphabetical order. 
+ ######################################################################### + + def _Add(self, t): + self.__binary_op(t, '+') + + def _And(self, t): + self._write(" (") + for i, node in enumerate(t.nodes): + self._dispatch(node) + if i != len(t.nodes)-1: + self._write(") and (") + self._write(")") + + def _AssAttr(self, t): + """ Handle assigning an attribute of an object + """ + self._dispatch(t.expr) + self._write('.'+t.attrname) + + def _Assign(self, t): + """ Expression Assignment such as "a = 1". + + This only handles assignment in expressions. Keyword assignment + is handled separately. + """ + self._fill() + for target in t.nodes: + self._dispatch(target) + self._write(" = ") + self._dispatch(t.expr) + if not self._do_indent: + self._write('; ') + + def _AssName(self, t): + """ Name on left hand side of expression. + + Treat just like a name on the right side of an expression. + """ + self._Name(t) + + def _AssTuple(self, t): + """ Tuple on left hand side of an expression. + """ + + # _write each elements, separated by a comma. + for element in t.nodes[:-1]: + self._dispatch(element) + self._write(", ") + + # Handle the last one without writing comma + last_element = t.nodes[-1] + self._dispatch(last_element) + + def _AugAssign(self, t): + """ +=,-=,*=,/=,**=, etc. operations + """ + + self._fill() + self._dispatch(t.node) + self._write(' '+t.op+' ') + self._dispatch(t.expr) + if not self._do_indent: + self._write(';') + + def _Bitand(self, t): + """ Bit and operation. + """ + + for i, node in enumerate(t.nodes): + self._write("(") + self._dispatch(node) + self._write(")") + if i != len(t.nodes)-1: + self._write(" & ") + + def _Bitor(self, t): + """ Bit or operation + """ + + for i, node in enumerate(t.nodes): + self._write("(") + self._dispatch(node) + self._write(")") + if i != len(t.nodes)-1: + self._write(" | ") + + def _CallFunc(self, t): + """ Function call. + """ + self._dispatch(t.node) + self._write("(") + comma = False + for e in t.args: + if comma: self._write(", ") + else: comma = True + self._dispatch(e) + if t.star_args: + if comma: self._write(", ") + else: comma = True + self._write("*") + self._dispatch(t.star_args) + if t.dstar_args: + if comma: self._write(", ") + else: comma = True + self._write("**") + self._dispatch(t.dstar_args) + self._write(")") + + def _Compare(self, t): + self._dispatch(t.expr) + for op, expr in t.ops: + self._write(" " + op + " ") + self._dispatch(expr) + + def _Const(self, t): + """ A constant value such as an integer value, 3, or a string, "hello". + """ + self._dispatch(t.value) + + def _Decorators(self, t): + """ Handle function decorators (eg. @has_units) + """ + for node in t.nodes: + self._dispatch(node) + + def _Dict(self, t): + self._write("{") + for i, (k, v) in enumerate(t.items): + self._dispatch(k) + self._write(": ") + self._dispatch(v) + if i < len(t.items)-1: + self._write(", ") + self._write("}") + + def _Discard(self, t): + """ Node for when return value is ignored such as in "foo(a)". + """ + self._fill() + self._dispatch(t.expr) + + def _Div(self, t): + self.__binary_op(t, '/') + + def _Ellipsis(self, t): + self._write("...") + + def _From(self, t): + """ Handle "from xyz import foo, bar as baz". + """ + # fixme: Are From and ImportFrom handled differently? 
+ self._fill("from ") + self._write(t.modname) + self._write(" import ") + for i, (name,asname) in enumerate(t.names): + if i != 0: + self._write(", ") + self._write(name) + if asname is not None: + self._write(" as "+asname) + + def _Function(self, t): + """ Handle function definitions + """ + if t.decorators is not None: + self._fill("@") + self._dispatch(t.decorators) + self._fill("def "+t.name + "(") + defaults = [None] * (len(t.argnames) - len(t.defaults)) + list(t.defaults) + for i, arg in enumerate(zip(t.argnames, defaults)): + self._write(arg[0]) + if arg[1] is not None: + self._write('=') + self._dispatch(arg[1]) + if i < len(t.argnames)-1: + self._write(', ') + self._write(")") + if self._single_func: + self._do_indent = False + self._enter() + self._dispatch(t.code) + self._leave() + self._do_indent = True + + def _Getattr(self, t): + """ Handle getting an attribute of an object + """ + if isinstance(t.expr, (Div, Mul, Sub, Add)): + self._write('(') + self._dispatch(t.expr) + self._write(')') + else: + self._dispatch(t.expr) + + self._write('.'+t.attrname) + + def _If(self, t): + self._fill() + + for i, (compare,code) in enumerate(t.tests): + if i == 0: + self._write("if ") + else: + self._write("elif ") + self._dispatch(compare) + self._enter() + self._fill() + self._dispatch(code) + self._leave() + self._write("\n") + + if t.else_ is not None: + self._write("else") + self._enter() + self._fill() + self._dispatch(t.else_) + self._leave() + self._write("\n") + + def _IfExp(self, t): + self._dispatch(t.then) + self._write(" if ") + self._dispatch(t.test) + + if t.else_ is not None: + self._write(" else (") + self._dispatch(t.else_) + self._write(")") + + def _Import(self, t): + """ Handle "import xyz.foo". + """ + self._fill("import ") + + for i, (name,asname) in enumerate(t.names): + if i != 0: + self._write(", ") + self._write(name) + if asname is not None: + self._write(" as "+asname) + + def _Keyword(self, t): + """ Keyword value assignment within function calls and definitions. 
+ """ + self._write(t.name) + self._write("=") + self._dispatch(t.expr) + + def _List(self, t): + self._write("[") + for i,node in enumerate(t.nodes): + self._dispatch(node) + if i < len(t.nodes)-1: + self._write(", ") + self._write("]") + + def _Module(self, t): + if t.doc is not None: + self._dispatch(t.doc) + self._dispatch(t.node) + + def _Mul(self, t): + self.__binary_op(t, '*') + + def _Name(self, t): + self._write(t.name) + + def _NoneType(self, t): + self._write("None") + + def _Not(self, t): + self._write('not (') + self._dispatch(t.expr) + self._write(')') + + def _Or(self, t): + self._write(" (") + for i, node in enumerate(t.nodes): + self._dispatch(node) + if i != len(t.nodes)-1: + self._write(") or (") + self._write(")") + + def _Pass(self, t): + self._write("pass\n") + + def _Printnl(self, t): + self._fill("print ") + if t.dest: + self._write(">> ") + self._dispatch(t.dest) + self._write(", ") + comma = False + for node in t.nodes: + if comma: self._write(', ') + else: comma = True + self._dispatch(node) + + def _Power(self, t): + self.__binary_op(t, '**') + + def _Return(self, t): + self._fill("return ") + if t.value: + if isinstance(t.value, Tuple): + text = ', '.join([ name.name for name in t.value.asList() ]) + self._write(text) + else: + self._dispatch(t.value) + if not self._do_indent: + self._write('; ') + + def _Slice(self, t): + self._dispatch(t.expr) + self._write("[") + if t.lower: + self._dispatch(t.lower) + self._write(":") + if t.upper: + self._dispatch(t.upper) + #if t.step: + # self._write(":") + # self._dispatch(t.step) + self._write("]") + + def _Sliceobj(self, t): + for i, node in enumerate(t.nodes): + if i != 0: + self._write(":") + if not (isinstance(node, Const) and node.value is None): + self._dispatch(node) + + def _Stmt(self, tree): + for node in tree.nodes: + self._dispatch(node) + + def _Sub(self, t): + self.__binary_op(t, '-') + + def _Subscript(self, t): + self._dispatch(t.expr) + self._write("[") + for i, value in enumerate(t.subs): + if i != 0: + self._write(",") + self._dispatch(value) + self._write("]") + + def _TryExcept(self, t): + self._fill("try") + self._enter() + self._dispatch(t.body) + self._leave() + + for handler in t.handlers: + self._fill('except ') + self._dispatch(handler[0]) + if handler[1] is not None: + self._write(', ') + self._dispatch(handler[1]) + self._enter() + self._dispatch(handler[2]) + self._leave() + + if t.else_: + self._fill("else") + self._enter() + self._dispatch(t.else_) + self._leave() + + def _Tuple(self, t): + + if not t.nodes: + # Empty tuple. + self._write("()") + else: + self._write("(") + + # _write each elements, separated by a comma. 
+ for element in t.nodes[:-1]: + self._dispatch(element) + self._write(", ") + + # Handle the last one without writing comma + last_element = t.nodes[-1] + self._dispatch(last_element) + + self._write(")") + + def _UnaryAdd(self, t): + self._write("+") + self._dispatch(t.expr) + + def _UnarySub(self, t): + self._write("-") + self._dispatch(t.expr) + + def _With(self, t): + self._fill('with ') + self._dispatch(t.expr) + if t.vars: + self._write(' as ') + self._dispatch(t.vars.name) + self._enter() + self._dispatch(t.body) + self._leave() + self._write('\n') + + def _int(self, t): + self._write(repr(t)) + + def __binary_op(self, t, symbol): + # Check if parenthesis are needed on left side and then dispatch + has_paren = False + left_class = str(t.left.__class__) + if (left_class in op_precedence.keys() and + op_precedence[left_class] < op_precedence[str(t.__class__)]): + has_paren = True + if has_paren: + self._write('(') + self._dispatch(t.left) + if has_paren: + self._write(')') + # Write the appropriate symbol for operator + self._write(symbol) + # Check if parenthesis are needed on the right side and then dispatch + has_paren = False + right_class = str(t.right.__class__) + if (right_class in op_precedence.keys() and + op_precedence[right_class] < op_precedence[str(t.__class__)]): + has_paren = True + if has_paren: + self._write('(') + self._dispatch(t.right) + if has_paren: + self._write(')') + + def _float(self, t): + # if t is 0.1, str(t)->'0.1' while repr(t)->'0.1000000000001' + # We prefer str here. + self._write(str(t)) + + def _str(self, t): + self._write(repr(t)) + + def _tuple(self, t): + self._write(str(t)) + + ######################################################################### + # These are the methods from the _ast modules unparse. + # + # As our needs to handle more advanced code increase, we may want to + # modify some of the methods below so that they work for compiler.ast. + ######################################################################### + +# # stmt +# def _Expr(self, tree): +# self._fill() +# self._dispatch(tree.value) +# +# def _Import(self, t): +# self._fill("import ") +# first = True +# for a in t.names: +# if first: +# first = False +# else: +# self._write(", ") +# self._write(a.name) +# if a.asname: +# self._write(" as "+a.asname) +# +## def _ImportFrom(self, t): +## self._fill("from ") +## self._write(t.module) +## self._write(" import ") +## for i, a in enumerate(t.names): +## if i == 0: +## self._write(", ") +## self._write(a.name) +## if a.asname: +## self._write(" as "+a.asname) +## # XXX(jpe) what is level for? 
+## +# +# def _Break(self, t): +# self._fill("break") +# +# def _Continue(self, t): +# self._fill("continue") +# +# def _Delete(self, t): +# self._fill("del ") +# self._dispatch(t.targets) +# +# def _Assert(self, t): +# self._fill("assert ") +# self._dispatch(t.test) +# if t.msg: +# self._write(", ") +# self._dispatch(t.msg) +# +# def _Exec(self, t): +# self._fill("exec ") +# self._dispatch(t.body) +# if t.globals: +# self._write(" in ") +# self._dispatch(t.globals) +# if t.locals: +# self._write(", ") +# self._dispatch(t.locals) +# +# def _Print(self, t): +# self._fill("print ") +# do_comma = False +# if t.dest: +# self._write(">>") +# self._dispatch(t.dest) +# do_comma = True +# for e in t.values: +# if do_comma:self._write(", ") +# else:do_comma=True +# self._dispatch(e) +# if not t.nl: +# self._write(",") +# +# def _Global(self, t): +# self._fill("global") +# for i, n in enumerate(t.names): +# if i != 0: +# self._write(",") +# self._write(" " + n) +# +# def _Yield(self, t): +# self._fill("yield") +# if t.value: +# self._write(" (") +# self._dispatch(t.value) +# self._write(")") +# +# def _Raise(self, t): +# self._fill('raise ') +# if t.type: +# self._dispatch(t.type) +# if t.inst: +# self._write(", ") +# self._dispatch(t.inst) +# if t.tback: +# self._write(", ") +# self._dispatch(t.tback) +# +# +# def _TryFinally(self, t): +# self._fill("try") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# self._fill("finally") +# self._enter() +# self._dispatch(t.finalbody) +# self._leave() +# +# def _excepthandler(self, t): +# self._fill("except ") +# if t.type: +# self._dispatch(t.type) +# if t.name: +# self._write(", ") +# self._dispatch(t.name) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _ClassDef(self, t): +# self._write("\n") +# self._fill("class "+t.name) +# if t.bases: +# self._write("(") +# for a in t.bases: +# self._dispatch(a) +# self._write(", ") +# self._write(")") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _FunctionDef(self, t): +# self._write("\n") +# for deco in t.decorators: +# self._fill("@") +# self._dispatch(deco) +# self._fill("def "+t.name + "(") +# self._dispatch(t.args) +# self._write(")") +# self._enter() +# self._dispatch(t.body) +# self._leave() +# +# def _For(self, t): +# self._fill("for ") +# self._dispatch(t.target) +# self._write(" in ") +# self._dispatch(t.iter) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# if t.orelse: +# self._fill("else") +# self._enter() +# self._dispatch(t.orelse) +# self._leave +# +# def _While(self, t): +# self._fill("while ") +# self._dispatch(t.test) +# self._enter() +# self._dispatch(t.body) +# self._leave() +# if t.orelse: +# self._fill("else") +# self._enter() +# self._dispatch(t.orelse) +# self._leave +# +# # expr +# def _Str(self, tree): +# self._write(repr(tree.s)) +## +# def _Repr(self, t): +# self._write("`") +# self._dispatch(t.value) +# self._write("`") +# +# def _Num(self, t): +# self._write(repr(t.n)) +# +# def _ListComp(self, t): +# self._write("[") +# self._dispatch(t.elt) +# for gen in t.generators: +# self._dispatch(gen) +# self._write("]") +# +# def _GeneratorExp(self, t): +# self._write("(") +# self._dispatch(t.elt) +# for gen in t.generators: +# self._dispatch(gen) +# self._write(")") +# +# def _comprehension(self, t): +# self._write(" for ") +# self._dispatch(t.target) +# self._write(" in ") +# self._dispatch(t.iter) +# for if_clause in t.ifs: +# self._write(" if ") +# self._dispatch(if_clause) +# +# def _IfExp(self, t): +# 
self._dispatch(t.body) +# self._write(" if ") +# self._dispatch(t.test) +# if t.orelse: +# self._write(" else ") +# self._dispatch(t.orelse) +# +# unop = {"Invert":"~", "Not": "not", "UAdd":"+", "USub":"-"} +# def _UnaryOp(self, t): +# self._write(self.unop[t.op.__class__.__name__]) +# self._write("(") +# self._dispatch(t.operand) +# self._write(")") +# +# binop = { "Add":"+", "Sub":"-", "Mult":"*", "Div":"/", "Mod":"%", +# "LShift":">>", "RShift":"<<", "BitOr":"|", "BitXor":"^", "BitAnd":"&", +# "FloorDiv":"//", "Pow": "**"} +# def _BinOp(self, t): +# self._write("(") +# self._dispatch(t.left) +# self._write(")" + self.binop[t.op.__class__.__name__] + "(") +# self._dispatch(t.right) +# self._write(")") +# +# boolops = {_ast.And: 'and', _ast.Or: 'or'} +# def _BoolOp(self, t): +# self._write("(") +# self._dispatch(t.values[0]) +# for v in t.values[1:]: +# self._write(" %s " % self.boolops[t.op.__class__]) +# self._dispatch(v) +# self._write(")") +# +# def _Attribute(self,t): +# self._dispatch(t.value) +# self._write(".") +# self._write(t.attr) +# +## def _Call(self, t): +## self._dispatch(t.func) +## self._write("(") +## comma = False +## for e in t.args: +## if comma: self._write(", ") +## else: comma = True +## self._dispatch(e) +## for e in t.keywords: +## if comma: self._write(", ") +## else: comma = True +## self._dispatch(e) +## if t.starargs: +## if comma: self._write(", ") +## else: comma = True +## self._write("*") +## self._dispatch(t.starargs) +## if t.kwargs: +## if comma: self._write(", ") +## else: comma = True +## self._write("**") +## self._dispatch(t.kwargs) +## self._write(")") +# +# # slice +# def _Index(self, t): +# self._dispatch(t.value) +# +# def _ExtSlice(self, t): +# for i, d in enumerate(t.dims): +# if i != 0: +# self._write(': ') +# self._dispatch(d) +# +# # others +# def _arguments(self, t): +# first = True +# nonDef = len(t.args)-len(t.defaults) +# for a in t.args[0:nonDef]: +# if first:first = False +# else: self._write(", ") +# self._dispatch(a) +# for a,d in zip(t.args[nonDef:], t.defaults): +# if first:first = False +# else: self._write(", ") +# self._dispatch(a), +# self._write("=") +# self._dispatch(d) +# if t.vararg: +# if first:first = False +# else: self._write(", ") +# self._write("*"+t.vararg) +# if t.kwarg: +# if first:first = False +# else: self._write(", ") +# self._write("**"+t.kwarg) +# +## def _keyword(self, t): +## self._write(t.arg) +## self._write("=") +## self._dispatch(t.value) +# +# def _Lambda(self, t): +# self._write("lambda ") +# self._dispatch(t.args) +# self._write(": ") +# self._dispatch(t.body) + + + diff --git a/doc/sphinxext/docscrape.py b/doc/sphinxext/docscrape.py new file mode 100755 index 00000000..63fec42a --- /dev/null +++ b/doc/sphinxext/docscrape.py @@ -0,0 +1,499 @@ +"""Extract reference documentation from the NumPy source tree. + +""" + +import inspect +import textwrap +import re +import pydoc +from StringIO import StringIO +from warnings import warn + +class Reader(object): + """A line-based string reader. + + """ + def __init__(self, data): + """ + Parameters + ---------- + data : str + String with lines separated by '\n'. 
+ + """ + if isinstance(data,list): + self._str = data + else: + self._str = data.split('\n') # store string as list of lines + + self.reset() + + def __getitem__(self, n): + return self._str[n] + + def reset(self): + self._l = 0 # current line nr + + def read(self): + if not self.eof(): + out = self[self._l] + self._l += 1 + return out + else: + return '' + + def seek_next_non_empty_line(self): + for l in self[self._l:]: + if l.strip(): + break + else: + self._l += 1 + + def eof(self): + return self._l >= len(self._str) + + def read_to_condition(self, condition_func): + start = self._l + for line in self[start:]: + if condition_func(line): + return self[start:self._l] + self._l += 1 + if self.eof(): + return self[start:self._l+1] + return [] + + def read_to_next_empty_line(self): + self.seek_next_non_empty_line() + def is_empty(line): + return not line.strip() + return self.read_to_condition(is_empty) + + def read_to_next_unindented_line(self): + def is_unindented(line): + return (line.strip() and (len(line.lstrip()) == len(line))) + return self.read_to_condition(is_unindented) + + def peek(self,n=0): + if self._l + n < len(self._str): + return self[self._l + n] + else: + return '' + + def is_empty(self): + return not ''.join(self._str).strip() + + +class NumpyDocString(object): + def __init__(self, docstring, config={}): + docstring = textwrap.dedent(docstring).split('\n') + + self._doc = Reader(docstring) + self._parsed_data = { + 'Signature': '', + 'Summary': [''], + 'Extended Summary': [], + 'Parameters': [], + 'Returns': [], + 'Raises': [], + 'Warns': [], + 'Other Parameters': [], + 'Attributes': [], + 'Methods': [], + 'See Also': [], + 'Notes': [], + 'Warnings': [], + 'References': '', + 'Examples': '', + 'index': {} + } + + self._parse() + + def __getitem__(self,key): + return self._parsed_data[key] + + def __setitem__(self,key,val): + if not self._parsed_data.has_key(key): + warn("Unknown section %s" % key) + else: + self._parsed_data[key] = val + + def _is_at_section(self): + self._doc.seek_next_non_empty_line() + + if self._doc.eof(): + return False + + l1 = self._doc.peek().strip() # e.g. Parameters + + if l1.startswith('.. 
index::'): + return True + + l2 = self._doc.peek(1).strip() # ---------- or ========== + return l2.startswith('-'*len(l1)) or l2.startswith('='*len(l1)) + + def _strip(self,doc): + i = 0 + j = 0 + for i,line in enumerate(doc): + if line.strip(): break + + for j,line in enumerate(doc[::-1]): + if line.strip(): break + + return doc[i:len(doc)-j] + + def _read_to_next_section(self): + section = self._doc.read_to_next_empty_line() + + while not self._is_at_section() and not self._doc.eof(): + if not self._doc.peek(-1).strip(): # previous line was empty + section += [''] + + section += self._doc.read_to_next_empty_line() + + return section + + def _read_sections(self): + while not self._doc.eof(): + data = self._read_to_next_section() + name = data[0].strip() + + if name.startswith('..'): # index section + yield name, data[1:] + elif len(data) < 2: + yield StopIteration + else: + yield name, self._strip(data[2:]) + + def _parse_param_list(self,content): + r = Reader(content) + params = [] + while not r.eof(): + header = r.read().strip() + if ' : ' in header: + arg_name, arg_type = header.split(' : ')[:2] + else: + arg_name, arg_type = header, '' + + desc = r.read_to_next_unindented_line() + desc = dedent_lines(desc) + + params.append((arg_name,arg_type,desc)) + + return params + + + _name_rgx = re.compile(r"^\s*(:(?P\w+):`(?P[a-zA-Z0-9_.-]+)`|" + r" (?P[a-zA-Z0-9_.-]+))\s*", re.X) + def _parse_see_also(self, content): + """ + func_name : Descriptive text + continued text + another_func_name : Descriptive text + func_name1, func_name2, :meth:`func_name`, func_name3 + + """ + items = [] + + def parse_item_name(text): + """Match ':role:`name`' or 'name'""" + m = self._name_rgx.match(text) + if m: + g = m.groups() + if g[1] is None: + return g[3], None + else: + return g[2], g[1] + raise ValueError("%s is not a item name" % text) + + def push_item(name, rest): + if not name: + return + name, role = parse_item_name(name) + items.append((name, list(rest), role)) + del rest[:] + + current_func = None + rest = [] + + for line in content: + if not line.strip(): continue + + m = self._name_rgx.match(line) + if m and line[m.end():].strip().startswith(':'): + push_item(current_func, rest) + current_func, line = line[:m.end()], line[m.end():] + rest = [line.split(':', 1)[1].strip()] + if not rest[0]: + rest = [] + elif not line.startswith(' '): + push_item(current_func, rest) + current_func = None + if ',' in line: + for func in line.split(','): + if func.strip(): + push_item(func, []) + elif line.strip(): + current_func = line + elif current_func is not None: + rest.append(line.strip()) + push_item(current_func, rest) + return items + + def _parse_index(self, section, content): + """ + .. 
index: default + :refguide: something, else, and more + + """ + def strip_each_in(lst): + return [s.strip() for s in lst] + + out = {} + section = section.split('::') + if len(section) > 1: + out['default'] = strip_each_in(section[1].split(','))[0] + for line in content: + line = line.split(':') + if len(line) > 2: + out[line[1]] = strip_each_in(line[2].split(',')) + return out + + def _parse_summary(self): + """Grab signature (if given) and summary""" + if self._is_at_section(): + return + + summary = self._doc.read_to_next_empty_line() + summary_str = " ".join([s.strip() for s in summary]).strip() + if re.compile('^([\w., ]+=)?\s*[\w\.]+\(.*\)$').match(summary_str): + self['Signature'] = summary_str + if not self._is_at_section(): + self['Summary'] = self._doc.read_to_next_empty_line() + else: + self['Summary'] = summary + + if not self._is_at_section(): + self['Extended Summary'] = self._read_to_next_section() + + def _parse(self): + self._doc.reset() + self._parse_summary() + + for (section,content) in self._read_sections(): + if not section.startswith('..'): + section = ' '.join([s.capitalize() for s in section.split(' ')]) + if section in ('Parameters', 'Attributes', 'Methods', + 'Returns', 'Raises', 'Warns'): + self[section] = self._parse_param_list(content) + elif section.startswith('.. index::'): + self['index'] = self._parse_index(section, content) + elif section == 'See Also': + self['See Also'] = self._parse_see_also(content) + else: + self[section] = content + + # string conversion routines + + def _str_header(self, name, symbol='-'): + return [name, len(name)*symbol] + + def _str_indent(self, doc, indent=4): + out = [] + for line in doc: + out += [' '*indent + line] + return out + + def _str_signature(self): + if self['Signature']: + return [self['Signature'].replace('*','\*')] + [''] + else: + return [''] + + def _str_summary(self): + if self['Summary']: + return self['Summary'] + [''] + else: + return [] + + def _str_extended_summary(self): + if self['Extended Summary']: + return self['Extended Summary'] + [''] + else: + return [] + + def _str_param_list(self, name): + out = [] + if self[name]: + out += self._str_header(name) + for param,param_type,desc in self[name]: + out += ['%s : %s' % (param, param_type)] + out += self._str_indent(desc) + out += [''] + return out + + def _str_section(self, name): + out = [] + if self[name]: + out += self._str_header(name) + out += self[name] + out += [''] + return out + + def _str_see_also(self, func_role): + if not self['See Also']: return [] + out = [] + out += self._str_header("See Also") + last_had_desc = True + for func, desc, role in self['See Also']: + if role: + link = ':%s:`%s`' % (role, func) + elif func_role: + link = ':%s:`%s`' % (func_role, func) + else: + link = "`%s`_" % func + if desc or last_had_desc: + out += [''] + out += [link] + else: + out[-1] += ", %s" % link + if desc: + out += self._str_indent([' '.join(desc)]) + last_had_desc = True + else: + last_had_desc = False + out += [''] + return out + + def _str_index(self): + idx = self['index'] + out = [] + out += ['.. 
index:: %s' % idx.get('default','')] + for section, references in idx.iteritems(): + if section == 'default': + continue + out += [' :%s: %s' % (section, ', '.join(references))] + return out + + def __str__(self, func_role=''): + out = [] + out += self._str_signature() + out += self._str_summary() + out += self._str_extended_summary() + for param_list in ('Parameters','Returns','Raises'): + out += self._str_param_list(param_list) + out += self._str_section('Warnings') + out += self._str_see_also(func_role) + for s in ('Notes','References','Examples'): + out += self._str_section(s) + for param_list in ('Attributes', 'Methods'): + out += self._str_param_list(param_list) + out += self._str_index() + return '\n'.join(out) + + +def indent(str,indent=4): + indent_str = ' '*indent + if str is None: + return indent_str + lines = str.split('\n') + return '\n'.join(indent_str + l for l in lines) + +def dedent_lines(lines): + """Deindent a list of lines maximally""" + return textwrap.dedent("\n".join(lines)).split("\n") + +def header(text, style='-'): + return text + '\n' + style*len(text) + '\n' + + +class FunctionDoc(NumpyDocString): + def __init__(self, func, role='func', doc=None, config={}): + self._f = func + self._role = role # e.g. "func" or "meth" + + if doc is None: + if func is None: + raise ValueError("No function or docstring given") + doc = inspect.getdoc(func) or '' + NumpyDocString.__init__(self, doc) + + if not self['Signature'] and func is not None: + func, func_name = self.get_func() + try: + # try to read signature + argspec = inspect.getargspec(func) + argspec = inspect.formatargspec(*argspec) + argspec = argspec.replace('*','\*') + signature = '%s%s' % (func_name, argspec) + except TypeError, e: + signature = '%s()' % func_name + self['Signature'] = signature + + def get_func(self): + func_name = getattr(self._f, '__name__', self.__class__.__name__) + if inspect.isclass(self._f): + func = getattr(self._f, '__call__', self._f.__init__) + else: + func = self._f + return func, func_name + + def __str__(self): + out = '' + + func, func_name = self.get_func() + signature = self['Signature'].replace('*', '\*') + + roles = {'func': 'function', + 'meth': 'method'} + + if self._role: + if not roles.has_key(self._role): + print "Warning: invalid role %s" % self._role + out += '.. %s:: %s\n \n\n' % (roles.get(self._role,''), + func_name) + + out += super(FunctionDoc, self).__str__(func_role=self._role) + return out + + +class ClassDoc(NumpyDocString): + def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, + config={}): + if not inspect.isclass(cls) and cls is not None: + raise ValueError("Expected a class or None, but got %r" % cls) + self._cls = cls + + if modulename and not modulename.endswith('.'): + modulename += '.' 
+ self._mod = modulename + + if doc is None: + if cls is None: + raise ValueError("No class or documentation string given") + doc = pydoc.getdoc(cls) + + NumpyDocString.__init__(self, doc) + + if config.get('show_class_members', True): + if not self['Methods']: + self['Methods'] = [(name, '', '') + for name in sorted(self.methods)] + if not self['Attributes']: + self['Attributes'] = [(name, '', '') + for name in sorted(self.properties)] + + @property + def methods(self): + if self._cls is None: + return [] + return [name for name,func in inspect.getmembers(self._cls) + if not name.startswith('_') and callable(func)] + + @property + def properties(self): + if self._cls is None: + return [] + return [name for name,func in inspect.getmembers(self._cls) + if not name.startswith('_') and func is None] diff --git a/doc/sphinxext/docscrape_sphinx.py b/doc/sphinxext/docscrape_sphinx.py new file mode 100755 index 00000000..9f4350d4 --- /dev/null +++ b/doc/sphinxext/docscrape_sphinx.py @@ -0,0 +1,226 @@ +import re, inspect, textwrap, pydoc +import sphinx +from docscrape import NumpyDocString, FunctionDoc, ClassDoc + +class SphinxDocString(NumpyDocString): + def __init__(self, docstring, config={}): + self.use_plots = config.get('use_plots', False) + NumpyDocString.__init__(self, docstring, config=config) + + # string conversion routines + def _str_header(self, name, symbol='`'): + return ['.. rubric:: ' + name, ''] + + def _str_field_list(self, name): + return [':' + name + ':'] + + def _str_indent(self, doc, indent=4): + out = [] + for line in doc: + out += [' '*indent + line] + return out + + def _str_signature(self): + return [''] + if self['Signature']: + return ['``%s``' % self['Signature']] + [''] + else: + return [''] + + def _str_summary(self): + return self['Summary'] + [''] + + def _str_extended_summary(self): + return self['Extended Summary'] + [''] + + def _str_param_list(self, name): + out = [] + if self[name]: + out += self._str_field_list(name) + out += [''] + for param,param_type,desc in self[name]: + out += self._str_indent(['**%s** : %s' % (param.strip(), + param_type)]) + out += [''] + out += self._str_indent(desc,8) + out += [''] + return out + + @property + def _obj(self): + if hasattr(self, '_cls'): + return self._cls + elif hasattr(self, '_f'): + return self._f + return None + + def _str_member_list(self, name): + """ + Generate a member listing, autosummary:: table where possible, + and a table where not. + + """ + out = [] + if self[name]: + out += ['.. rubric:: %s' % name, ''] + prefix = getattr(self, '_name', '') + + if prefix: + prefix = '~%s.' % prefix + + autosum = [] + others = [] + for param, param_type, desc in self[name]: + param = param.strip() + if not self._obj or hasattr(self._obj, param): + autosum += [" %s%s" % (prefix, param)] + else: + others.append((param, param_type, desc)) + + if autosum: + out += ['.. 
autosummary::', ' :toctree:', ''] + out += autosum + + if others: + maxlen_0 = max([len(x[0]) for x in others]) + maxlen_1 = max([len(x[1]) for x in others]) + hdr = "="*maxlen_0 + " " + "="*maxlen_1 + " " + "="*10 + fmt = '%%%ds %%%ds ' % (maxlen_0, maxlen_1) + n_indent = maxlen_0 + maxlen_1 + 4 + out += [hdr] + for param, param_type, desc in others: + out += [fmt % (param.strip(), param_type)] + out += self._str_indent(desc, n_indent) + out += [hdr] + out += [''] + return out + + def _str_section(self, name): + out = [] + if self[name]: + out += self._str_header(name) + out += [''] + content = textwrap.dedent("\n".join(self[name])).split("\n") + out += content + out += [''] + return out + + def _str_see_also(self, func_role): + out = [] + if self['See Also']: + see_also = super(SphinxDocString, self)._str_see_also(func_role) + out = ['.. seealso::', ''] + out += self._str_indent(see_also[2:]) + return out + + def _str_warnings(self): + out = [] + if self['Warnings']: + out = ['.. warning::', ''] + out += self._str_indent(self['Warnings']) + return out + + def _str_index(self): + idx = self['index'] + out = [] + if len(idx) == 0: + return out + + out += ['.. index:: %s' % idx.get('default','')] + for section, references in idx.iteritems(): + if section == 'default': + continue + elif section == 'refguide': + out += [' single: %s' % (', '.join(references))] + else: + out += [' %s: %s' % (section, ','.join(references))] + return out + + def _str_references(self): + out = [] + if self['References']: + out += self._str_header('References') + if isinstance(self['References'], str): + self['References'] = [self['References']] + out.extend(self['References']) + out += [''] + # Latex collects all references to a separate bibliography, + # so we need to insert links to it + if sphinx.__version__ >= "0.6": + out += ['.. only:: latex',''] + else: + out += ['.. latexonly::',''] + items = [] + for line in self['References']: + m = re.match(r'.. \[([a-z0-9._-]+)\]', line, re.I) + if m: + items.append(m.group(1)) + out += [' ' + ", ".join(["[%s]_" % item for item in items]), ''] + return out + + def _str_examples(self): + examples_str = "\n".join(self['Examples']) + + if (self.use_plots and 'import matplotlib' in examples_str + and 'plot::' not in examples_str): + out = [] + out += self._str_header('Examples') + out += ['.. 
plot::', ''] + out += self._str_indent(self['Examples']) + out += [''] + return out + else: + return self._str_section('Examples') + + def __str__(self, indent=0, func_role="obj"): + out = [] + out += self._str_signature() + out += self._str_index() + [''] + out += self._str_summary() + out += self._str_extended_summary() + for param_list in ('Parameters', 'Returns', 'Raises'): + out += self._str_param_list(param_list) + out += self._str_warnings() + out += self._str_see_also(func_role) + out += self._str_section('Notes') + out += self._str_references() + out += self._str_examples() + for param_list in ('Attributes', 'Methods'): + out += self._str_member_list(param_list) + out = self._str_indent(out,indent) + return '\n'.join(out) + +class SphinxFunctionDoc(SphinxDocString, FunctionDoc): + def __init__(self, obj, doc=None, config={}): + self.use_plots = config.get('use_plots', False) + FunctionDoc.__init__(self, obj, doc=doc, config=config) + +class SphinxClassDoc(SphinxDocString, ClassDoc): + def __init__(self, obj, doc=None, func_doc=None, config={}): + self.use_plots = config.get('use_plots', False) + ClassDoc.__init__(self, obj, doc=doc, func_doc=None, config=config) + +class SphinxObjDoc(SphinxDocString): + def __init__(self, obj, doc=None, config={}): + self._f = obj + SphinxDocString.__init__(self, doc, config=config) + +def get_doc_object(obj, what=None, doc=None, config={}): + if what is None: + if inspect.isclass(obj): + what = 'class' + elif inspect.ismodule(obj): + what = 'module' + elif callable(obj): + what = 'function' + else: + what = 'object' + if what == 'class': + return SphinxClassDoc(obj, func_doc=SphinxFunctionDoc, doc=doc, + config=config) + elif what in ('function', 'method'): + return SphinxFunctionDoc(obj, doc=doc, config=config) + else: + if doc is None: + doc = pydoc.getdoc(obj) + return SphinxObjDoc(obj, doc, config=config) diff --git a/doc/sphinxext/ipython_console_highlighting.py b/doc/sphinxext/ipython_console_highlighting.py new file mode 100644 index 00000000..f0a41beb --- /dev/null +++ b/doc/sphinxext/ipython_console_highlighting.py @@ -0,0 +1,114 @@ +"""reST directive for syntax-highlighting ipython interactive sessions. + +XXX - See what improvements can be made based on the new (as of Sept 2009) +'pycon' lexer for the python console. At the very least it will give better +highlighted tracebacks. +""" + +#----------------------------------------------------------------------------- +# Needed modules + +# Standard library +import re + +# Third party +from pygments.lexer import Lexer, do_insertions +from pygments.lexers.agile import (PythonConsoleLexer, PythonLexer, + PythonTracebackLexer) +from pygments.token import Comment, Generic + +from sphinx import highlighting + +#----------------------------------------------------------------------------- +# Global constants +line_re = re.compile('.*?\n') + +#----------------------------------------------------------------------------- +# Code begins - classes and functions + +class IPythonConsoleLexer(Lexer): + """ + For IPython console output or doctests, such as: + + .. sourcecode:: ipython + + In [1]: a = 'foo' + + In [2]: a + Out[2]: 'foo' + + In [3]: print a + foo + + In [4]: 1 / 0 + + Notes: + + - Tracebacks are not currently supported. + + - It assumes the default IPython prompts, not customized ones. 
+ """ + + name = 'IPython console session' + aliases = ['ipython'] + mimetypes = ['text/x-ipython-console'] + input_prompt = re.compile("(In \[[0-9]+\]: )|( \.\.\.+:)") + output_prompt = re.compile("(Out\[[0-9]+\]: )|( \.\.\.+:)") + continue_prompt = re.compile(" \.\.\.+:") + tb_start = re.compile("\-+") + + def get_tokens_unprocessed(self, text): + pylexer = PythonLexer(**self.options) + tblexer = PythonTracebackLexer(**self.options) + + curcode = '' + insertions = [] + for match in line_re.finditer(text): + line = match.group() + input_prompt = self.input_prompt.match(line) + continue_prompt = self.continue_prompt.match(line.rstrip()) + output_prompt = self.output_prompt.match(line) + if line.startswith("#"): + insertions.append((len(curcode), + [(0, Comment, line)])) + elif input_prompt is not None: + insertions.append((len(curcode), + [(0, Generic.Prompt, input_prompt.group())])) + curcode += line[input_prompt.end():] + elif continue_prompt is not None: + insertions.append((len(curcode), + [(0, Generic.Prompt, continue_prompt.group())])) + curcode += line[continue_prompt.end():] + elif output_prompt is not None: + # Use the 'error' token for output. We should probably make + # our own token, but error is typicaly in a bright color like + # red, so it works fine for our output prompts. + insertions.append((len(curcode), + [(0, Generic.Error, output_prompt.group())])) + curcode += line[output_prompt.end():] + else: + if curcode: + for item in do_insertions(insertions, + pylexer.get_tokens_unprocessed(curcode)): + yield item + curcode = '' + insertions = [] + yield match.start(), Generic.Output, line + if curcode: + for item in do_insertions(insertions, + pylexer.get_tokens_unprocessed(curcode)): + yield item + + +def setup(app): + """Setup as a sphinx extension.""" + + # This is only a lexer, so adding it below to pygments appears sufficient. + # But if somebody knows that the right API usage should be to do that via + # sphinx, by all means fix it here. At least having this setup.py + # suppresses the sphinx warning we'd get without it. + pass + +#----------------------------------------------------------------------------- +# Register the extension as a valid pygments lexer +highlighting.lexers['ipython'] = IPythonConsoleLexer() diff --git a/doc/sphinxext/ipython_directive.py b/doc/sphinxext/ipython_directive.py new file mode 100644 index 00000000..9b09ff95 --- /dev/null +++ b/doc/sphinxext/ipython_directive.py @@ -0,0 +1,909 @@ +# -*- coding: utf-8 -*- +"""Sphinx directive to support embedded IPython code. + +This directive allows pasting of entire interactive IPython sessions, prompts +and all, and their code will actually get re-executed at doc build time, with +all prompts renumbered sequentially. It also allows you to input code as a pure +python input by giving the argument python to the directive. The output looks +like an interactive ipython section. + +To enable this directive, simply list it in your Sphinx ``conf.py`` file +(making sure the directory where you placed it is visible to sphinx, as is +needed for all Sphinx directives). + +By default this directive assumes that your prompts are unchanged IPython ones, +but this can be customized. The configurable options that can be placed in +conf.py are + +ipython_savefig_dir: + The directory in which to save the figures. This is relative to the + Sphinx source directory. The default is `html_static_path`. +ipython_rgxin: + The compiled regular expression to denote the start of IPython input + lines. 
The default is re.compile('In \[(\d+)\]:\s?(.*)\s*'). You + shouldn't need to change this. +ipython_rgxout: + The compiled regular expression to denote the start of IPython output + lines. The default is re.compile('Out\[(\d+)\]:\s?(.*)\s*'). You + shouldn't need to change this. +ipython_promptin: + The string to represent the IPython input prompt in the generated ReST. + The default is 'In [%d]:'. This expects that the line numbers are used + in the prompt. +ipython_promptout: + + The string to represent the IPython prompt in the generated ReST. The + default is 'Out [%d]:'. This expects that the line numbers are used + in the prompt. + +ToDo +---- + +- Turn the ad-hoc test() function into a real test suite. +- Break up ipython-specific functionality from matplotlib stuff into better + separated code. + +Authors +------- + +- John D Hunter: orignal author. +- Fernando Perez: refactoring, documentation, cleanups, port to 0.11. +- VĂĄclavĹ milauer : Prompt generalizations. +- Skipper Seabold, refactoring, cleanups, pure python addition +""" + +#----------------------------------------------------------------------------- +# Imports +#----------------------------------------------------------------------------- + +# Stdlib +import ast +import cStringIO +import os +import re +import sys +import tempfile + +# To keep compatibility with various python versions +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +# Third-party +import matplotlib +import sphinx +from docutils.parsers.rst import directives +from docutils import nodes +from sphinx.util.compat import Directive + +matplotlib.use('Agg') + +# Our own +from IPython import Config, InteractiveShell +from IPython.core.profiledir import ProfileDir +from IPython.utils import io + +from pdb import set_trace + +#----------------------------------------------------------------------------- +# Globals +#----------------------------------------------------------------------------- +# for tokenizing blocks +COMMENT, INPUT, OUTPUT = range(3) + +#----------------------------------------------------------------------------- +# Functions and class declarations +#----------------------------------------------------------------------------- +def block_parser(part, rgxin, rgxout, fmtin, fmtout): + """ + part is a string of ipython text, comprised of at most one + input, one ouput, comments, and blank lines. The block parser + parses the text into a list of:: + + blocks = [ (TOKEN0, data0), (TOKEN1, data1), ...] + + where TOKEN is one of [COMMENT | INPUT | OUTPUT ] and + data is, depending on the type of token:: + + COMMENT : the comment string + + INPUT: the (DECORATOR, INPUT_LINE, REST) where + DECORATOR: the input decorator (or None) + INPUT_LINE: the input as string (possibly multi-line) + REST : any stdout generated by the input line (not OUTPUT) + + + OUTPUT: the output string, possibly multi-line + """ + + block = [] + lines = part.split('\n') + N = len(lines) + i = 0 + decorator = None + while 1: + + if i==N: + # nothing left to parse -- the last line + break + + line = lines[i] + i += 1 + line_stripped = line.strip() + if line_stripped.startswith('#'): + block.append((COMMENT, line)) + continue + + if line_stripped.startswith('@'): + # we're assuming at most one decorator -- may need to + # rethink + decorator = line_stripped + continue + + # does this look like an input line? 
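+        # rgxin defaults to re.compile('In \[(\d+)\]:\s?(.*)\s*'), so for a
+        # line such as "In [3]: x = 1" group(1) is the prompt number ('3')
+        # and group(2) is the input text ('x = 1').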
+ matchin = rgxin.match(line) + if matchin: + lineno, inputline = int(matchin.group(1)), matchin.group(2) + + # the ....: continuation string + continuation = ' %s:'% ''.join(['.']*(len(str(lineno))+2)) + Nc = len(continuation) + # input lines can continue on for more than one line, if + # we have a '\' line continuation char or a function call + # echo line 'print'. The input line can only be + # terminated by the end of the block or an output line, so + # we parse out the rest of the input line if it is + # multiline as well as any echo text + + rest = [] + while i 0] + + for lineno, line in enumerate(content): + + line_stripped = line.strip() + if not len(line): + output.append(line) + continue + + # handle decorators + if line_stripped.startswith('@'): + output.extend([line]) + if 'savefig' in line: + savefig = True # and need to clear figure + continue + + # handle comments + if line_stripped.startswith('#'): + output.extend([line]) + continue + + continuation = u' %s:'% ''.join(['.']*(len(str(ct))+2)) + if not multiline: + modified = u"%s %s" % (fmtin % ct, line_stripped) + output.append(modified) + ct += 1 + try: + ast.parse(line_stripped) + output.append(u'') + except Exception: + multiline = True + multiline_start = lineno + else: + modified = u'%s %s' % (continuation, line) + output.append(modified) + + try: + ast.parse('\n'.join(content[multiline_start:lineno+1])) + + if (lineno < len(content) - 1 and + _count_indent(content[multiline_start]) < + _count_indent(content[lineno + 1])): + + continue + + output.extend([continuation, u'']) + multiline = False + except Exception: + pass + + continue + + return output + +def _count_indent(x): + import re + m = re.match('(\s+)(.*)', x) + if not m: + return 0 + return len(m.group(1)) + +class IpythonDirective(Directive): + + has_content = True + required_arguments = 0 + optional_arguments = 4 # python, suppress, verbatim, doctest + final_argumuent_whitespace = True + option_spec = { 'python': directives.unchanged, + 'suppress' : directives.flag, + 'verbatim' : directives.flag, + 'doctest' : directives.flag, + 'okexcept' : directives.flag, + } + + shell = EmbeddedSphinxShell() + + def get_config_options(self): + # contains sphinx configuration variables + config = self.state.document.settings.env.config + + # get config variables to set figure output directory + confdir = self.state.document.settings.env.app.confdir + savefig_dir = config.ipython_savefig_dir + source_dir = os.path.dirname(self.state.document.current_source) + if savefig_dir is None: + savefig_dir = config.html_static_path + if isinstance(savefig_dir, list): + savefig_dir = savefig_dir[0] # safe to assume only one path? 
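+        # A relative savefig_dir (or the html_static_path fallback chosen
+        # above) is resolved against the Sphinx confdir below.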
+ savefig_dir = os.path.join(confdir, savefig_dir) + + # get regex and prompt stuff + rgxin = config.ipython_rgxin + rgxout = config.ipython_rgxout + promptin = config.ipython_promptin + promptout = config.ipython_promptout + + return savefig_dir, source_dir, rgxin, rgxout, promptin, promptout + + def setup(self): + # get config values + (savefig_dir, source_dir, rgxin, + rgxout, promptin, promptout) = self.get_config_options() + + # and attach to shell so we don't have to pass them around + self.shell.rgxin = rgxin + self.shell.rgxout = rgxout + self.shell.promptin = promptin + self.shell.promptout = promptout + self.shell.savefig_dir = savefig_dir + self.shell.source_dir = source_dir + + # setup bookmark for saving figures directory + + self.shell.process_input_line('bookmark ipy_savedir %s'%savefig_dir, + store_history=False) + self.shell.clear_cout() + + return rgxin, rgxout, promptin, promptout + + + def teardown(self): + # delete last bookmark + self.shell.process_input_line('bookmark -d ipy_savedir', + store_history=False) + self.shell.clear_cout() + + def run(self): + debug = False + + #TODO, any reason block_parser can't be a method of embeddable shell + # then we wouldn't have to carry these around + rgxin, rgxout, promptin, promptout = self.setup() + + options = self.options + self.shell.is_suppress = 'suppress' in options + self.shell.is_doctest = 'doctest' in options + self.shell.is_verbatim = 'verbatim' in options + self.shell.is_okexcept = 'okexcept' in options + self.shell.current_content = self.content + + # handle pure python code + if 'python' in self.arguments: + content = self.content + self.content = self.shell.process_pure_python2(content) + + parts = '\n'.join(self.content).split('\n\n') + + lines = ['.. code-block:: ipython',''] + figures = [] + + for part in parts: + + block = block_parser(part, rgxin, rgxout, promptin, promptout) + + if len(block): + rows, figure = self.shell.process_block(block) + for row in rows: + # hack + # if row == '': + # continue + + # lines.extend([' %s'% row.strip()]) + lines.extend([' %s' % line + for line in re.split('[\n]+', row)]) + + if figure is not None: + figures.append(figure) + + #text = '\n'.join(lines) + #figs = '\n'.join(figures) + + for figure in figures: + lines.append('') + lines.extend(figure.split('\n')) + lines.append('') + + #print lines + if len(lines)>2: + if debug: + print '\n'.join(lines) + else: #NOTE: this raises some errors, what's it for? + #print 'INSERTING %d lines'%len(lines) + self.state_machine.insert_input( + lines, self.state_machine.input_lines.source(0)) + + text = '\n'.join(lines) + txtnode = nodes.literal_block(text, text) + txtnode['language'] = 'ipython' + #imgnode = nodes.image(figs) + + # cleanup + self.teardown() + + return []#, imgnode] + +# Enable as a proper Sphinx directive +def setup(app): + setup.app = app + + app.add_directive('ipython', IpythonDirective) + app.add_config_value('ipython_savefig_dir', None, True) + app.add_config_value('ipython_rgxin', + re.compile('In \[(\d+)\]:\s?(.*)\s*'), True) + app.add_config_value('ipython_rgxout', + re.compile('Out\[(\d+)\]:\s?(.*)\s*'), True) + app.add_config_value('ipython_promptin', 'In [%d]:', True) + app.add_config_value('ipython_promptout', 'Out[%d]:', True) + + +# Simple smoke test, needs to be converted to a proper automatic test. 
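+# (For reference, the options registered in setup() above can be overridden
+# from a project's Sphinx conf.py.  A minimal, purely illustrative sketch:
+#
+#     extensions = ['ipython_directive', 'ipython_console_highlighting']
+#     ipython_savefig_dir = 'savefig'
+#     ipython_promptin = 'In [%d]:'
+#     ipython_promptout = 'Out[%d]:'
+#
+# The smoke test below feeds a few canned sessions through the directive.)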
+def test(): + + examples = [ + r""" +In [9]: pwd +Out[9]: '/home/jdhunter/py4science/book' + +In [10]: cd bookdata/ +/home/jdhunter/py4science/book/bookdata + +In [2]: from pylab import * + +In [2]: ion() + +In [3]: im = imread('stinkbug.png') + +@savefig mystinkbug.png width=4in +In [4]: imshow(im) +Out[4]: + +""", + r""" + +In [1]: x = 'hello world' + +# string methods can be +# used to alter the string +@doctest +In [2]: x.upper() +Out[2]: 'HELLO WORLD' + +@verbatim +In [3]: x.st +x.startswith x.strip +""", + r""" + +In [130]: url = 'http://ichart.finance.yahoo.com/table.csv?s=CROX\ + .....: &d=9&e=22&f=2009&g=d&a=1&br=8&c=2006&ignore=.csv' + +In [131]: print url.split('&') +['http://ichart.finance.yahoo.com/table.csv?s=CROX', 'd=9', 'e=22', 'f=2009', 'g=d', 'a=1', 'b=8', 'c=2006', 'ignore=.csv'] + +In [60]: import urllib + +""", + r"""\ + +In [133]: import numpy.random + +@suppress +In [134]: numpy.random.seed(2358) + +@doctest +In [135]: numpy.random.rand(10,2) +Out[135]: +array([[ 0.64524308, 0.59943846], + [ 0.47102322, 0.8715456 ], + [ 0.29370834, 0.74776844], + [ 0.99539577, 0.1313423 ], + [ 0.16250302, 0.21103583], + [ 0.81626524, 0.1312433 ], + [ 0.67338089, 0.72302393], + [ 0.7566368 , 0.07033696], + [ 0.22591016, 0.77731835], + [ 0.0072729 , 0.34273127]]) + +""", + + r""" +In [106]: print x +jdh + +In [109]: for i in range(10): + n +.....: print i + .....: + .....: +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +""", + + r""" + +In [144]: from pylab import * + +In [145]: ion() + +# use a semicolon to suppress the output +@savefig test_hist.png width=4in +In [151]: hist(np.random.randn(10000), 100); + + +@savefig test_plot.png width=4in +In [151]: plot(np.random.randn(10000), 'o'); + """, + + r""" +# use a semicolon to suppress the output +In [151]: plt.clf() + +@savefig plot_simple.png width=4in +In [151]: plot([1,2,3]) + +@savefig hist_simple.png width=4in +In [151]: hist(np.random.randn(10000), 100); + +""", + r""" +# update the current fig +In [151]: ylabel('number') + +In [152]: title('normal distribution') + + +@savefig hist_with_text.png +In [153]: grid(True) + + """, + ] + # skip local-file depending first example: + examples = examples[1:] + + #ipython_directive.DEBUG = True # dbg + #options = dict(suppress=True) # dbg + options = dict() + for example in examples: + content = example.split('\n') + ipython_directive('debug', arguments=None, options=options, + content=content, lineno=0, + content_offset=None, block_text=None, + state=None, state_machine=None, + ) + +# Run test suite as a script +if __name__=='__main__': + if not os.path.isdir('_static'): + os.mkdir('_static') + test() + print 'All OK? Check figures in _static/' diff --git a/doc/sphinxext/numpydoc.py b/doc/sphinxext/numpydoc.py new file mode 100755 index 00000000..43c67336 --- /dev/null +++ b/doc/sphinxext/numpydoc.py @@ -0,0 +1,169 @@ +""" +======== +numpydoc +======== + +Sphinx extension that handles docstrings in the Numpy standard format. [1] + +It will: + +- Convert Parameters etc. sections to field lists. +- Convert See Also section to a See also entry. +- Renumber references. +- Extract the signature from the docstring, if it can't be determined otherwise. + +.. 
[1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard + +""" + +import sphinx + +if sphinx.__version__ < '1.0.1': + raise RuntimeError("Sphinx 1.0.1 or newer is required") + +import os, re, pydoc +from docscrape_sphinx import get_doc_object, SphinxDocString +from sphinx.util.compat import Directive +import inspect + +def mangle_docstrings(app, what, name, obj, options, lines, + reference_offset=[0]): + + cfg = dict(use_plots=app.config.numpydoc_use_plots, + show_class_members=app.config.numpydoc_show_class_members) + + if what == 'module': + # Strip top title + title_re = re.compile(ur'^\s*[#*=]{4,}\n[a-z0-9 -]+\n[#*=]{4,}\s*', + re.I|re.S) + lines[:] = title_re.sub(u'', u"\n".join(lines)).split(u"\n") + else: + doc = get_doc_object(obj, what, u"\n".join(lines), config=cfg) + lines[:] = unicode(doc).split(u"\n") + + if app.config.numpydoc_edit_link and hasattr(obj, '__name__') and \ + obj.__name__: + if hasattr(obj, '__module__'): + v = dict(full_name=u"%s.%s" % (obj.__module__, obj.__name__)) + else: + v = dict(full_name=obj.__name__) + lines += [u'', u'.. htmlonly::', ''] + lines += [u' %s' % x for x in + (app.config.numpydoc_edit_link % v).split("\n")] + + # replace reference numbers so that there are no duplicates + references = [] + for line in lines: + line = line.strip() + m = re.match(ur'^.. \[([a-z0-9_.-])\]', line, re.I) + if m: + references.append(m.group(1)) + + # start renaming from the longest string, to avoid overwriting parts + references.sort(key=lambda x: -len(x)) + if references: + for i, line in enumerate(lines): + for r in references: + if re.match(ur'^\d+$', r): + new_r = u"R%d" % (reference_offset[0] + int(r)) + else: + new_r = u"%s%d" % (r, reference_offset[0]) + lines[i] = lines[i].replace(u'[%s]_' % r, + u'[%s]_' % new_r) + lines[i] = lines[i].replace(u'.. [%s]' % r, + u'.. 
[%s]' % new_r) + + reference_offset[0] += len(references) + +def mangle_signature(app, what, name, obj, options, sig, retann): + # Do not try to inspect classes that don't define `__init__` + if (inspect.isclass(obj) and + (not hasattr(obj, '__init__') or + 'initializes x; see ' in pydoc.getdoc(obj.__init__))): + return '', '' + + if not (callable(obj) or hasattr(obj, '__argspec_is_invalid_')): return + if not hasattr(obj, '__doc__'): return + + doc = SphinxDocString(pydoc.getdoc(obj)) + if doc['Signature']: + sig = re.sub(u"^[^(]*", u"", doc['Signature']) + return sig, u'' + +def setup(app, get_doc_object_=get_doc_object): + global get_doc_object + get_doc_object = get_doc_object_ + + app.connect('autodoc-process-docstring', mangle_docstrings) + app.connect('autodoc-process-signature', mangle_signature) + app.add_config_value('numpydoc_edit_link', None, False) + app.add_config_value('numpydoc_use_plots', None, False) + app.add_config_value('numpydoc_show_class_members', True, True) + + # Extra mangling domains + app.add_domain(NumpyPythonDomain) + app.add_domain(NumpyCDomain) + +#------------------------------------------------------------------------------ +# Docstring-mangling domains +#------------------------------------------------------------------------------ + +from docutils.statemachine import ViewList +from sphinx.domains.c import CDomain +from sphinx.domains.python import PythonDomain + +class ManglingDomainBase(object): + directive_mangling_map = {} + + def __init__(self, *a, **kw): + super(ManglingDomainBase, self).__init__(*a, **kw) + self.wrap_mangling_directives() + + def wrap_mangling_directives(self): + for name, objtype in self.directive_mangling_map.items(): + self.directives[name] = wrap_mangling_directive( + self.directives[name], objtype) + +class NumpyPythonDomain(ManglingDomainBase, PythonDomain): + name = 'np' + directive_mangling_map = { + 'function': 'function', + 'class': 'class', + 'exception': 'class', + 'method': 'function', + 'classmethod': 'function', + 'staticmethod': 'function', + 'attribute': 'attribute', + } + +class NumpyCDomain(ManglingDomainBase, CDomain): + name = 'np-c' + directive_mangling_map = { + 'function': 'function', + 'member': 'attribute', + 'macro': 'function', + 'type': 'class', + 'var': 'object', + } + +def wrap_mangling_directive(base_directive, objtype): + class directive(base_directive): + def run(self): + env = self.state.document.settings.env + + name = None + if self.arguments: + m = re.match(r'^(.*\s+)?(.*?)(\(.*)?', self.arguments[0]) + name = m.group(2).strip() + + if not name: + name = self.arguments[0] + + lines = list(self.content) + mangle_docstrings(env.app, objtype, name, None, None, lines) + self.content = ViewList(lines, self.content.parent) + + return base_directive.run(self) + + return directive + diff --git a/doc/sphinxext/only_directives.py b/doc/sphinxext/only_directives.py new file mode 100755 index 00000000..c0dff7e6 --- /dev/null +++ b/doc/sphinxext/only_directives.py @@ -0,0 +1,96 @@ +# +# A pair of directives for inserting content that will only appear in +# either html or latex. 
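+# (Sphinx 0.6 and later provide an equivalent only:: directive; the
+# deprecation warning below points users at it.)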
+# + +from docutils.nodes import Body, Element +from docutils.writers.html4css1 import HTMLTranslator +try: + from sphinx.latexwriter import LaTeXTranslator +except ImportError: + from sphinx.writers.latex import LaTeXTranslator + + import warnings + warnings.warn("The numpydoc.only_directives module is deprecated;" + "please use the only:: directive available in Sphinx >= 0.6", + DeprecationWarning, stacklevel=2) + +from docutils.parsers.rst import directives + +class html_only(Body, Element): + pass + +class latex_only(Body, Element): + pass + +def run(content, node_class, state, content_offset): + text = '\n'.join(content) + node = node_class(text) + state.nested_parse(content, content_offset, node) + return [node] + +try: + from docutils.parsers.rst import Directive +except ImportError: + from docutils.parsers.rst.directives import _directives + + def html_only_directive(name, arguments, options, content, lineno, + content_offset, block_text, state, state_machine): + return run(content, html_only, state, content_offset) + + def latex_only_directive(name, arguments, options, content, lineno, + content_offset, block_text, state, state_machine): + return run(content, latex_only, state, content_offset) + + for func in (html_only_directive, latex_only_directive): + func.content = 1 + func.options = {} + func.arguments = None + + _directives['htmlonly'] = html_only_directive + _directives['latexonly'] = latex_only_directive +else: + class OnlyDirective(Directive): + has_content = True + required_arguments = 0 + optional_arguments = 0 + final_argument_whitespace = True + option_spec = {} + + def run(self): + self.assert_has_content() + return run(self.content, self.node_class, + self.state, self.content_offset) + + class HtmlOnlyDirective(OnlyDirective): + node_class = html_only + + class LatexOnlyDirective(OnlyDirective): + node_class = latex_only + + directives.register_directive('htmlonly', HtmlOnlyDirective) + directives.register_directive('latexonly', LatexOnlyDirective) + +def setup(app): + app.add_node(html_only) + app.add_node(latex_only) + + # Add visit/depart methods to HTML-Translator: + def visit_perform(self, node): + pass + def depart_perform(self, node): + pass + def visit_ignore(self, node): + node.children = [] + def depart_ignore(self, node): + node.children = [] + + HTMLTranslator.visit_html_only = visit_perform + HTMLTranslator.depart_html_only = depart_perform + HTMLTranslator.visit_latex_only = visit_ignore + HTMLTranslator.depart_latex_only = depart_ignore + + LaTeXTranslator.visit_html_only = visit_ignore + LaTeXTranslator.depart_html_only = depart_ignore + LaTeXTranslator.visit_latex_only = visit_perform + LaTeXTranslator.depart_latex_only = depart_perform diff --git a/doc/sphinxext/phantom_import.py b/doc/sphinxext/phantom_import.py new file mode 100755 index 00000000..c77eeb54 --- /dev/null +++ b/doc/sphinxext/phantom_import.py @@ -0,0 +1,162 @@ +""" +============== +phantom_import +============== + +Sphinx extension to make directives from ``sphinx.ext.autodoc`` and similar +extensions to use docstrings loaded from an XML file. + +This extension loads an XML file in the Pydocweb format [1] and +creates a dummy module that contains the specified docstrings. This +can be used to get the current docstrings from a Pydocweb instance +without needing to rebuild the documented module. + +.. 
[1] http://code.google.com/p/pydocweb + +""" +import imp, sys, compiler, types, os, inspect, re + +def setup(app): + app.connect('builder-inited', initialize) + app.add_config_value('phantom_import_file', None, True) + +def initialize(app): + fn = app.config.phantom_import_file + if (fn and os.path.isfile(fn)): + print "[numpydoc] Phantom importing modules from", fn, "..." + import_phantom_module(fn) + +#------------------------------------------------------------------------------ +# Creating 'phantom' modules from an XML description +#------------------------------------------------------------------------------ +def import_phantom_module(xml_file): + """ + Insert a fake Python module to sys.modules, based on a XML file. + + The XML file is expected to conform to Pydocweb DTD. The fake + module will contain dummy objects, which guarantee the following: + + - Docstrings are correct. + - Class inheritance relationships are correct (if present in XML). + - Function argspec is *NOT* correct (even if present in XML). + Instead, the function signature is prepended to the function docstring. + - Class attributes are *NOT* correct; instead, they are dummy objects. + + Parameters + ---------- + xml_file : str + Name of an XML file to read + + """ + import lxml.etree as etree + + object_cache = {} + + tree = etree.parse(xml_file) + root = tree.getroot() + + # Sort items so that + # - Base classes come before classes inherited from them + # - Modules come before their contents + all_nodes = dict([(n.attrib['id'], n) for n in root]) + + def _get_bases(node, recurse=False): + bases = [x.attrib['ref'] for x in node.findall('base')] + if recurse: + j = 0 + while True: + try: + b = bases[j] + except IndexError: break + if b in all_nodes: + bases.extend(_get_bases(all_nodes[b])) + j += 1 + return bases + + type_index = ['module', 'class', 'callable', 'object'] + + def base_cmp(a, b): + x = cmp(type_index.index(a.tag), type_index.index(b.tag)) + if x != 0: return x + + if a.tag == 'class' and b.tag == 'class': + a_bases = _get_bases(a, recurse=True) + b_bases = _get_bases(b, recurse=True) + x = cmp(len(a_bases), len(b_bases)) + if x != 0: return x + if a.attrib['id'] in b_bases: return -1 + if b.attrib['id'] in a_bases: return 1 + + return cmp(a.attrib['id'].count('.'), b.attrib['id'].count('.')) + + nodes = root.getchildren() + nodes.sort(base_cmp) + + # Create phantom items + for node in nodes: + name = node.attrib['id'] + doc = (node.text or '').decode('string-escape') + "\n" + if doc == "\n": doc = "" + + # create parent, if missing + parent = name + while True: + parent = '.'.join(parent.split('.')[:-1]) + if not parent: break + if parent in object_cache: break + obj = imp.new_module(parent) + object_cache[parent] = obj + sys.modules[parent] = obj + + # create object + if node.tag == 'module': + obj = imp.new_module(name) + obj.__doc__ = doc + sys.modules[name] = obj + elif node.tag == 'class': + bases = [object_cache[b] for b in _get_bases(node) + if b in object_cache] + bases.append(object) + init = lambda self: None + init.__doc__ = doc + obj = type(name, tuple(bases), {'__doc__': doc, '__init__': init}) + obj.__name__ = name.split('.')[-1] + elif node.tag == 'callable': + funcname = node.attrib['id'].split('.')[-1] + argspec = node.attrib.get('argspec') + if argspec: + argspec = re.sub('^[^(]*', '', argspec) + doc = "%s%s\n\n%s" % (funcname, argspec, doc) + obj = lambda: 0 + obj.__argspec_is_invalid_ = True + obj.func_name = funcname + obj.__name__ = name + obj.__doc__ = doc + if 
inspect.isclass(object_cache[parent]): + obj.__objclass__ = object_cache[parent] + else: + class Dummy(object): pass + obj = Dummy() + obj.__name__ = name + obj.__doc__ = doc + if inspect.isclass(object_cache[parent]): + obj.__get__ = lambda: None + object_cache[name] = obj + + if parent: + if inspect.ismodule(object_cache[parent]): + obj.__module__ = parent + setattr(object_cache[parent], name.split('.')[-1], obj) + + # Populate items + for node in root: + obj = object_cache.get(node.attrib['id']) + if obj is None: continue + for ref in node.findall('ref'): + if node.tag == 'class': + if ref.attrib['ref'].startswith(node.attrib['id'] + '.'): + setattr(obj, ref.attrib['name'], + object_cache.get(ref.attrib['ref'])) + else: + setattr(obj, ref.attrib['name'], + object_cache.get(ref.attrib['ref'])) diff --git a/doc/sphinxext/plot_directive.py b/doc/sphinxext/plot_directive.py new file mode 100755 index 00000000..cacd53db --- /dev/null +++ b/doc/sphinxext/plot_directive.py @@ -0,0 +1,641 @@ +""" +A special directive for generating a matplotlib plot. + +.. warning:: + + This is a hacked version of plot_directive.py from Matplotlib. + It's very much subject to change! + + +Usage +----- + +Can be used like this:: + + .. plot:: examples/example.py + + .. plot:: + + import matplotlib.pyplot as plt + plt.plot([1,2,3], [4,5,6]) + + .. plot:: + + A plotting example: + + >>> import matplotlib.pyplot as plt + >>> plt.plot([1,2,3], [4,5,6]) + +The content is interpreted as doctest formatted if it has a line starting +with ``>>>``. + +The ``plot`` directive supports the options + + format : {'python', 'doctest'} + Specify the format of the input + + include-source : bool + Whether to display the source code. Default can be changed in conf.py + +and the ``image`` directive options ``alt``, ``height``, ``width``, +``scale``, ``align``, ``class``. + +Configuration options +--------------------- + +The plot directive has the following configuration options: + + plot_include_source + Default value for the include-source option + + plot_pre_code + Code that should be executed before each plot. + + plot_basedir + Base directory, to which plot:: file names are relative to. + (If None or empty, file names are relative to the directoly where + the file containing the directive is.) + + plot_formats + File formats to generate. List of tuples or strings:: + + [(suffix, dpi), suffix, ...] + + that determine the file format and the DPI. For entries whose + DPI was omitted, sensible defaults are chosen. + + plot_html_show_formats + Whether to show links to the files in HTML. + +TODO +---- + +* Refactor Latex output; now it's plain images, but it would be nice + to make them appear side-by-side, or in floats. 
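+
+Example configuration
+---------------------
+
+A conf.py using this directive might set (the values shown are only an
+illustration, echoing the defaults registered in ``setup()`` below)::
+
+    extensions = ['plot_directive']
+    plot_include_source = False
+    plot_formats = [('png', 80), ('hires.png', 200), ('pdf', 50)]
+    plot_html_show_formats = True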
+ +""" + +import sys, os, glob, shutil, imp, warnings, cStringIO, re, textwrap, traceback +import sphinx + +import warnings +warnings.warn("A plot_directive module is also available under " + "matplotlib.sphinxext; expect this numpydoc.plot_directive " + "module to be deprecated after relevant features have been " + "integrated there.", + FutureWarning, stacklevel=2) + + +#------------------------------------------------------------------------------ +# Registration hook +#------------------------------------------------------------------------------ + +def setup(app): + setup.app = app + setup.config = app.config + setup.confdir = app.confdir + + app.add_config_value('plot_pre_code', '', True) + app.add_config_value('plot_include_source', False, True) + app.add_config_value('plot_formats', ['png', 'hires.png', 'pdf'], True) + app.add_config_value('plot_basedir', None, True) + app.add_config_value('plot_html_show_formats', True, True) + + app.add_directive('plot', plot_directive, True, (0, 1, False), + **plot_directive_options) + +#------------------------------------------------------------------------------ +# plot:: directive +#------------------------------------------------------------------------------ +from docutils.parsers.rst import directives +from docutils import nodes + +def plot_directive(name, arguments, options, content, lineno, + content_offset, block_text, state, state_machine): + return run(arguments, content, options, state_machine, state, lineno) +plot_directive.__doc__ = __doc__ + +def _option_boolean(arg): + if not arg or not arg.strip(): + # no argument given, assume used as a flag + return True + elif arg.strip().lower() in ('no', '0', 'false'): + return False + elif arg.strip().lower() in ('yes', '1', 'true'): + return True + else: + raise ValueError('"%s" unknown boolean' % arg) + +def _option_format(arg): + return directives.choice(arg, ('python', 'lisp')) + +def _option_align(arg): + return directives.choice(arg, ("top", "middle", "bottom", "left", "center", + "right")) + +plot_directive_options = {'alt': directives.unchanged, + 'height': directives.length_or_unitless, + 'width': directives.length_or_percentage_or_unitless, + 'scale': directives.nonnegative_int, + 'align': _option_align, + 'class': directives.class_option, + 'include-source': _option_boolean, + 'format': _option_format, + } + +#------------------------------------------------------------------------------ +# Generating output +#------------------------------------------------------------------------------ + +from docutils import nodes, utils + +try: + # Sphinx depends on either Jinja or Jinja2 + import jinja2 + def format_template(template, **kw): + return jinja2.Template(template).render(**kw) +except ImportError: + import jinja + def format_template(template, **kw): + return jinja.from_string(template, **kw) + +TEMPLATE = """ +{{ source_code }} + +{{ only_html }} + + {% if source_link or (html_show_formats and not multi_image) %} + ( + {%- if source_link -%} + `Source code <{{ source_link }}>`__ + {%- endif -%} + {%- if html_show_formats and not multi_image -%} + {%- for img in images -%} + {%- for fmt in img.formats -%} + {%- if source_link or not loop.first -%}, {% endif -%} + `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ + {%- endfor -%} + {%- endfor -%} + {%- endif -%} + ) + {% endif %} + + {% for img in images %} + .. 
figure:: {{ build_dir }}/{{ img.basename }}.png + {%- for option in options %} + {{ option }} + {% endfor %} + + {% if html_show_formats and multi_image -%} + ( + {%- for fmt in img.formats -%} + {%- if not loop.first -%}, {% endif -%} + `{{ fmt }} <{{ dest_dir }}/{{ img.basename }}.{{ fmt }}>`__ + {%- endfor -%} + ) + {%- endif -%} + {% endfor %} + +{{ only_latex }} + + {% for img in images %} + .. image:: {{ build_dir }}/{{ img.basename }}.pdf + {% endfor %} + +""" + +class ImageFile(object): + def __init__(self, basename, dirname): + self.basename = basename + self.dirname = dirname + self.formats = [] + + def filename(self, format): + return os.path.join(self.dirname, "%s.%s" % (self.basename, format)) + + def filenames(self): + return [self.filename(fmt) for fmt in self.formats] + +def run(arguments, content, options, state_machine, state, lineno): + if arguments and content: + raise RuntimeError("plot:: directive can't have both args and content") + + document = state_machine.document + config = document.settings.env.config + + options.setdefault('include-source', config.plot_include_source) + + # determine input + rst_file = document.attributes['source'] + rst_dir = os.path.dirname(rst_file) + + if arguments: + if not config.plot_basedir: + source_file_name = os.path.join(rst_dir, + directives.uri(arguments[0])) + else: + source_file_name = os.path.join(setup.confdir, config.plot_basedir, + directives.uri(arguments[0])) + code = open(source_file_name, 'r').read() + output_base = os.path.basename(source_file_name) + else: + source_file_name = rst_file + code = textwrap.dedent("\n".join(map(str, content))) + counter = document.attributes.get('_plot_counter', 0) + 1 + document.attributes['_plot_counter'] = counter + base, ext = os.path.splitext(os.path.basename(source_file_name)) + output_base = '%s-%d.py' % (base, counter) + + base, source_ext = os.path.splitext(output_base) + if source_ext in ('.py', '.rst', '.txt'): + output_base = base + else: + source_ext = '' + + # ensure that LaTeX includegraphics doesn't choke in foo.bar.pdf filenames + output_base = output_base.replace('.', '-') + + # is it in doctest format? 
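+    # contains_doctest() treats the source as doctest-formatted when it is not
+    # plain compilable Python and some line begins with '>>>'; an explicit
+    # :format: option (handled just below) overrides that guess.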
+ is_doctest = contains_doctest(code) + if options.has_key('format'): + if options['format'] == 'python': + is_doctest = False + else: + is_doctest = True + + # determine output directory name fragment + source_rel_name = relpath(source_file_name, setup.confdir) + source_rel_dir = os.path.dirname(source_rel_name) + while source_rel_dir.startswith(os.path.sep): + source_rel_dir = source_rel_dir[1:] + + # build_dir: where to place output files (temporarily) + build_dir = os.path.join(os.path.dirname(setup.app.doctreedir), + 'plot_directive', + source_rel_dir) + if not os.path.exists(build_dir): + os.makedirs(build_dir) + + # output_dir: final location in the builder's directory + dest_dir = os.path.abspath(os.path.join(setup.app.builder.outdir, + source_rel_dir)) + + # how to link to files from the RST file + dest_dir_link = os.path.join(relpath(setup.confdir, rst_dir), + source_rel_dir).replace(os.path.sep, '/') + build_dir_link = relpath(build_dir, rst_dir).replace(os.path.sep, '/') + source_link = dest_dir_link + '/' + output_base + source_ext + + # make figures + try: + results = makefig(code, source_file_name, build_dir, output_base, + config) + errors = [] + except PlotError, err: + reporter = state.memo.reporter + sm = reporter.system_message( + 2, "Exception occurred in plotting %s: %s" % (output_base, err), + line=lineno) + results = [(code, [])] + errors = [sm] + + # generate output restructuredtext + total_lines = [] + for j, (code_piece, images) in enumerate(results): + if options['include-source']: + if is_doctest: + lines = [''] + lines += [row.rstrip() for row in code_piece.split('\n')] + else: + lines = ['.. code-block:: python', ''] + lines += [' %s' % row.rstrip() + for row in code_piece.split('\n')] + source_code = "\n".join(lines) + else: + source_code = "" + + opts = [':%s: %s' % (key, val) for key, val in options.items() + if key in ('alt', 'height', 'width', 'scale', 'align', 'class')] + + if sphinx.__version__ >= "0.6": + only_html = ".. only:: html" + only_latex = ".. only:: latex" + else: + only_html = ".. htmlonly::" + only_latex = ".. 
latexonly::" + + if j == 0: + src_link = source_link + else: + src_link = None + + result = format_template( + TEMPLATE, + dest_dir=dest_dir_link, + build_dir=build_dir_link, + source_link=src_link, + multi_image=len(images) > 1, + only_html=only_html, + only_latex=only_latex, + options=opts, + images=images, + source_code=source_code, + html_show_formats=config.plot_html_show_formats) + + total_lines.extend(result.split("\n")) + total_lines.extend("\n") + + if total_lines: + state_machine.insert_input(total_lines, source=source_file_name) + + # copy image files to builder's output directory + if not os.path.exists(dest_dir): + os.makedirs(dest_dir) + + for code_piece, images in results: + for img in images: + for fn in img.filenames(): + shutil.copyfile(fn, os.path.join(dest_dir, + os.path.basename(fn))) + + # copy script (if necessary) + if source_file_name == rst_file: + target_name = os.path.join(dest_dir, output_base + source_ext) + f = open(target_name, 'w') + f.write(unescape_doctest(code)) + f.close() + + return errors + + +#------------------------------------------------------------------------------ +# Run code and capture figures +#------------------------------------------------------------------------------ + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.image as image +from matplotlib import _pylab_helpers + +import exceptions + +def contains_doctest(text): + try: + # check if it's valid Python as-is + compile(text, '', 'exec') + return False + except SyntaxError: + pass + r = re.compile(r'^\s*>>>', re.M) + m = r.search(text) + return bool(m) + +def unescape_doctest(text): + """ + Extract code from a piece of text, which contains either Python code + or doctests. + + """ + if not contains_doctest(text): + return text + + code = "" + for line in text.split("\n"): + m = re.match(r'^\s*(>>>|\.\.\.) (.*)$', line) + if m: + code += m.group(2) + "\n" + elif line.strip(): + code += "# " + line.strip() + "\n" + else: + code += "\n" + return code + +def split_code_at_show(text): + """ + Split code at plt.show() + + """ + + parts = [] + is_doctest = contains_doctest(text) + + part = [] + for line in text.split("\n"): + if (not is_doctest and line.strip() == 'plt.show()') or \ + (is_doctest and line.strip() == '>>> plt.show()'): + part.append(line) + parts.append("\n".join(part)) + part = [] + else: + part.append(line) + if "\n".join(part).strip(): + parts.append("\n".join(part)) + return parts + +class PlotError(RuntimeError): + pass + +def run_code(code, code_path, ns=None): + # Change the working directory to the directory of the example, so + # it can get at its data files, if any. 
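+    # Everything changed here (cwd, sys.path, sys.argv, sys.stdout) is put back
+    # in the finally clause below, even when the executed code fails and a
+    # PlotError is raised instead.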
+ pwd = os.getcwd() + old_sys_path = list(sys.path) + if code_path is not None: + dirname = os.path.abspath(os.path.dirname(code_path)) + os.chdir(dirname) + sys.path.insert(0, dirname) + + # Redirect stdout + stdout = sys.stdout + sys.stdout = cStringIO.StringIO() + + # Reset sys.argv + old_sys_argv = sys.argv + sys.argv = [code_path] + + try: + try: + code = unescape_doctest(code) + if ns is None: + ns = {} + if not ns: + exec setup.config.plot_pre_code in ns + exec code in ns + except (Exception, SystemExit), err: + raise PlotError(traceback.format_exc()) + finally: + os.chdir(pwd) + sys.argv = old_sys_argv + sys.path[:] = old_sys_path + sys.stdout = stdout + return ns + + +#------------------------------------------------------------------------------ +# Generating figures +#------------------------------------------------------------------------------ + +def out_of_date(original, derived): + """ + Returns True if derivative is out-of-date wrt original, + both of which are full file paths. + """ + return (not os.path.exists(derived) + or os.stat(derived).st_mtime < os.stat(original).st_mtime) + + +def makefig(code, code_path, output_dir, output_base, config): + """ + Run a pyplot script *code* and save the images under *output_dir* + with file names derived from *output_base* + + """ + + # -- Parse format list + default_dpi = {'png': 80, 'hires.png': 200, 'pdf': 50} + formats = [] + for fmt in config.plot_formats: + if isinstance(fmt, str): + formats.append((fmt, default_dpi.get(fmt, 80))) + elif type(fmt) in (tuple, list) and len(fmt)==2: + formats.append((str(fmt[0]), int(fmt[1]))) + else: + raise PlotError('invalid image format "%r" in plot_formats' % fmt) + + # -- Try to determine if all images already exist + + code_pieces = split_code_at_show(code) + + # Look for single-figure output files first + all_exists = True + img = ImageFile(output_base, output_dir) + for format, dpi in formats: + if out_of_date(code_path, img.filename(format)): + all_exists = False + break + img.formats.append(format) + + if all_exists: + return [(code, [img])] + + # Then look for multi-figure output files + results = [] + all_exists = True + for i, code_piece in enumerate(code_pieces): + images = [] + for j in xrange(1000): + img = ImageFile('%s_%02d_%02d' % (output_base, i, j), output_dir) + for format, dpi in formats: + if out_of_date(code_path, img.filename(format)): + all_exists = False + break + img.formats.append(format) + + # assume that if we have one, we have them all + if not all_exists: + all_exists = (j > 0) + break + images.append(img) + if not all_exists: + break + results.append((code_piece, images)) + + if all_exists: + return results + + # -- We didn't find the files, so build them + + results = [] + ns = {} + + for i, code_piece in enumerate(code_pieces): + # Clear between runs + plt.close('all') + + # Run code + run_code(code_piece, code_path, ns) + + # Collect images + images = [] + fig_managers = _pylab_helpers.Gcf.get_all_fig_managers() + for j, figman in enumerate(fig_managers): + if len(fig_managers) == 1 and len(code_pieces) == 1: + img = ImageFile(output_base, output_dir) + else: + img = ImageFile("%s_%02d_%02d" % (output_base, i, j), + output_dir) + images.append(img) + for format, dpi in formats: + try: + figman.canvas.figure.savefig(img.filename(format), dpi=dpi, + bbox_inches='tight') + except exceptions.BaseException, err: + raise PlotError(traceback.format_exc()) + img.formats.append(format) + + # Results + results.append((code_piece, images)) + + return results + + 
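+
+# A quick illustration of how the helpers above cooperate (kept as a comment
+# rather than executable code): split_code_at_show() cuts a script into pieces
+# ending at each plt.show(), and makefig() then saves one group of figures per
+# piece, e.g.
+#
+#     >>> split_code_at_show("plt.plot([1, 2])\nplt.show()\nplt.hist(y)")
+#     ['plt.plot([1, 2])\nplt.show()', 'plt.hist(y)']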
+#------------------------------------------------------------------------------ +# Relative pathnames +#------------------------------------------------------------------------------ + +try: + from os.path import relpath +except ImportError: + # Copied from Python 2.7 + if 'posix' in sys.builtin_module_names: + def relpath(path, start=os.path.curdir): + """Return a relative version of a path""" + from os.path import sep, curdir, join, abspath, commonprefix, \ + pardir + + if not path: + raise ValueError("no path specified") + + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + + # Work out how much of the filepath is shared by start and path. + i = len(commonprefix([start_list, path_list])) + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) + elif 'nt' in sys.builtin_module_names: + def relpath(path, start=os.path.curdir): + """Return a relative version of a path""" + from os.path import sep, curdir, join, abspath, commonprefix, \ + pardir, splitunc + + if not path: + raise ValueError("no path specified") + start_list = abspath(start).split(sep) + path_list = abspath(path).split(sep) + if start_list[0].lower() != path_list[0].lower(): + unc_path, rest = splitunc(path) + unc_start, rest = splitunc(start) + if bool(unc_path) ^ bool(unc_start): + raise ValueError("Cannot mix UNC and non-UNC paths (%s and %s)" + % (path, start)) + else: + raise ValueError("path is on drive %s, start on drive %s" + % (path_list[0], start_list[0])) + # Work out how much of the filepath is shared by start and path. + for i in range(min(len(start_list), len(path_list))): + if start_list[i].lower() != path_list[i].lower(): + break + else: + i += 1 + + rel_list = [pardir] * (len(start_list)-i) + path_list[i:] + if not rel_list: + return curdir + return join(*rel_list) + else: + raise RuntimeError("Unsupported platform (no relpath available!)") diff --git a/doc/sphinxext/setup.py b/doc/sphinxext/setup.py new file mode 100755 index 00000000..016d8f8a --- /dev/null +++ b/doc/sphinxext/setup.py @@ -0,0 +1,31 @@ +from distutils.core import setup +import setuptools +import sys, os + +version = "0.3.dev" + +setup( + name="numpydoc", + packages=["numpydoc"], + package_dir={"numpydoc": ""}, + version=version, + description="Sphinx extension to support docstrings in Numpy format", + # classifiers from http://pypi.python.org/pypi?%3Aaction=list_classifiers + classifiers=["Development Status :: 3 - Alpha", + "Environment :: Plugins", + "License :: OSI Approved :: BSD License", + "Topic :: Documentation"], + keywords="sphinx numpy", + author="Pauli Virtanen and others", + author_email="pav@iki.fi", + url="http://projects.scipy.org/numpy/browser/trunk/doc/sphinxext", + license="BSD", + zip_safe=False, + install_requires=["Sphinx >= 0.5"], + package_data={'numpydoc': 'tests', '': ''}, + entry_points={ + "console_scripts": [ + "autosummary_generate = numpydoc.autosummary_generate:main", + ], + }, +) diff --git a/doc/sphinxext/tests/test_docscrape.py b/doc/sphinxext/tests/test_docscrape.py new file mode 100755 index 00000000..1d775e99 --- /dev/null +++ b/doc/sphinxext/tests/test_docscrape.py @@ -0,0 +1,545 @@ +# -*- encoding:utf-8 -*- + +import sys, os +sys.path.append(os.path.join(os.path.dirname(__file__), '..')) + +from docscrape import NumpyDocString, FunctionDoc, ClassDoc +from docscrape_sphinx import SphinxDocString, SphinxClassDoc +from nose.tools import * + +doc_txt = '''\ + numpy.multivariate_normal(mean, cov, shape=None) + 
+ Draw values from a multivariate normal distribution with specified + mean and covariance. + + The multivariate normal or Gaussian distribution is a generalisation + of the one-dimensional normal distribution to higher dimensions. + + Parameters + ---------- + mean : (N,) ndarray + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + + cov : (N,N) ndarray + Covariance matrix of the distribution. + shape : tuple of ints + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + + Returns + ------- + out : ndarray + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + + Warnings + -------- + Certain warnings apply. + + Notes + ----- + + Instead of specifying the full covariance matrix, popular + approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + + This geometrical property can be seen in two dimensions by plotting + generated data-points: + + >>> mean = [0,0] + >>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + + >>> x,y = multivariate_normal(mean,cov,5000).T + >>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + + Note that the covariance matrix must be symmetric and non-negative + definite. + + References + ---------- + .. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 + .. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + + See Also + -------- + some, other, funcs + otherfunc : relationship + + Examples + -------- + >>> mean = (1,2) + >>> cov = [[1,0],[1,0]] + >>> x = multivariate_normal(mean,cov,(3,3)) + >>> print x.shape + (3, 3, 2) + + The following is probably true, given that 0.6 is roughly twice the + standard deviation: + + >>> print list( (x[0,0,:] - mean) < 0.6 ) + [True, True] + + .. 
index:: random + :refguide: random;distributions, random;gauss + + ''' +doc = NumpyDocString(doc_txt) + + +def test_signature(): + assert doc['Signature'].startswith('numpy.multivariate_normal(') + assert doc['Signature'].endswith('shape=None)') + +def test_summary(): + assert doc['Summary'][0].startswith('Draw values') + assert doc['Summary'][-1].endswith('covariance.') + +def test_extended_summary(): + assert doc['Extended Summary'][0].startswith('The multivariate normal') + +def test_parameters(): + assert_equal(len(doc['Parameters']), 3) + assert_equal([n for n,_,_ in doc['Parameters']], ['mean','cov','shape']) + + arg, arg_type, desc = doc['Parameters'][1] + assert_equal(arg_type, '(N,N) ndarray') + assert desc[0].startswith('Covariance matrix') + assert doc['Parameters'][0][-1][-2] == ' (1+2+3)/3' + +def test_returns(): + assert_equal(len(doc['Returns']), 1) + arg, arg_type, desc = doc['Returns'][0] + assert_equal(arg, 'out') + assert_equal(arg_type, 'ndarray') + assert desc[0].startswith('The drawn samples') + assert desc[-1].endswith('distribution.') + +def test_notes(): + assert doc['Notes'][0].startswith('Instead') + assert doc['Notes'][-1].endswith('definite.') + assert_equal(len(doc['Notes']), 17) + +def test_references(): + assert doc['References'][0].startswith('..') + assert doc['References'][-1].endswith('2001.') + +def test_examples(): + assert doc['Examples'][0].startswith('>>>') + assert doc['Examples'][-1].endswith('True]') + +def test_index(): + assert_equal(doc['index']['default'], 'random') + print doc['index'] + assert_equal(len(doc['index']), 2) + assert_equal(len(doc['index']['refguide']), 2) + +def non_blank_line_by_line_compare(a,b): + a = [l for l in a.split('\n') if l.strip()] + b = [l for l in b.split('\n') if l.strip()] + for n,line in enumerate(a): + if not line == b[n]: + raise AssertionError("Lines %s of a and b differ: " + "\n>>> %s\n<<< %s\n" % + (n,line,b[n])) +def test_str(): + non_blank_line_by_line_compare(str(doc), +"""numpy.multivariate_normal(mean, cov, shape=None) + +Draw values from a multivariate normal distribution with specified +mean and covariance. + +The multivariate normal or Gaussian distribution is a generalisation +of the one-dimensional normal distribution to higher dimensions. + +Parameters +---------- +mean : (N,) ndarray + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + +cov : (N,N) ndarray + Covariance matrix of the distribution. +shape : tuple of ints + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + +Returns +------- +out : ndarray + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + +Warnings +-------- +Certain warnings apply. 
+ +See Also +-------- +`some`_, `other`_, `funcs`_ + +`otherfunc`_ + relationship + +Notes +----- +Instead of specifying the full covariance matrix, popular +approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + +This geometrical property can be seen in two dimensions by plotting +generated data-points: + +>>> mean = [0,0] +>>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + +>>> x,y = multivariate_normal(mean,cov,5000).T +>>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + +Note that the covariance matrix must be symmetric and non-negative +definite. + +References +---------- +.. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 +.. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + +Examples +-------- +>>> mean = (1,2) +>>> cov = [[1,0],[1,0]] +>>> x = multivariate_normal(mean,cov,(3,3)) +>>> print x.shape +(3, 3, 2) + +The following is probably true, given that 0.6 is roughly twice the +standard deviation: + +>>> print list( (x[0,0,:] - mean) < 0.6 ) +[True, True] + +.. index:: random + :refguide: random;distributions, random;gauss""") + + +def test_sphinx_str(): + sphinx_doc = SphinxDocString(doc_txt) + non_blank_line_by_line_compare(str(sphinx_doc), +""" +.. index:: random + single: random;distributions, random;gauss + +Draw values from a multivariate normal distribution with specified +mean and covariance. + +The multivariate normal or Gaussian distribution is a generalisation +of the one-dimensional normal distribution to higher dimensions. + +:Parameters: + + **mean** : (N,) ndarray + + Mean of the N-dimensional distribution. + + .. math:: + + (1+2+3)/3 + + **cov** : (N,N) ndarray + + Covariance matrix of the distribution. + + **shape** : tuple of ints + + Given a shape of, for example, (m,n,k), m*n*k samples are + generated, and packed in an m-by-n-by-k arrangement. Because + each sample is N-dimensional, the output shape is (m,n,k,N). + +:Returns: + + **out** : ndarray + + The drawn samples, arranged according to `shape`. If the + shape given is (m,n,...), then the shape of `out` is is + (m,n,...,N). + + In other words, each entry ``out[i,j,...,:]`` is an N-dimensional + value drawn from the distribution. + +.. warning:: + + Certain warnings apply. + +.. seealso:: + + :obj:`some`, :obj:`other`, :obj:`funcs` + + :obj:`otherfunc` + relationship + +.. rubric:: Notes + +Instead of specifying the full covariance matrix, popular +approximations include: + + - Spherical covariance (`cov` is a multiple of the identity matrix) + - Diagonal covariance (`cov` has non-negative elements only on the diagonal) + +This geometrical property can be seen in two dimensions by plotting +generated data-points: + +>>> mean = [0,0] +>>> cov = [[1,0],[0,100]] # diagonal covariance, points lie on x or y-axis + +>>> x,y = multivariate_normal(mean,cov,5000).T +>>> plt.plot(x,y,'x'); plt.axis('equal'); plt.show() + +Note that the covariance matrix must be symmetric and non-negative +definite. + +.. rubric:: References + +.. [1] A. Papoulis, "Probability, Random Variables, and Stochastic + Processes," 3rd ed., McGraw-Hill Companies, 1991 +.. [2] R.O. Duda, P.E. Hart, and D.G. Stork, "Pattern Classification," + 2nd ed., Wiley, 2001. + +.. only:: latex + + [1]_, [2]_ + +.. 
rubric:: Examples + +>>> mean = (1,2) +>>> cov = [[1,0],[1,0]] +>>> x = multivariate_normal(mean,cov,(3,3)) +>>> print x.shape +(3, 3, 2) + +The following is probably true, given that 0.6 is roughly twice the +standard deviation: + +>>> print list( (x[0,0,:] - mean) < 0.6 ) +[True, True] +""") + + +doc2 = NumpyDocString(""" + Returns array of indices of the maximum values of along the given axis. + + Parameters + ---------- + a : {array_like} + Array to look in. + axis : {None, integer} + If None, the index is into the flattened array, otherwise along + the specified axis""") + +def test_parameters_without_extended_description(): + assert_equal(len(doc2['Parameters']), 2) + +doc3 = NumpyDocString(""" + my_signature(*params, **kwds) + + Return this and that. + """) + +def test_escape_stars(): + signature = str(doc3).split('\n')[0] + assert_equal(signature, 'my_signature(\*params, \*\*kwds)') + +doc4 = NumpyDocString( + """a.conj() + + Return an array with all complex-valued elements conjugated.""") + +def test_empty_extended_summary(): + assert_equal(doc4['Extended Summary'], []) + +doc5 = NumpyDocString( + """ + a.something() + + Raises + ------ + LinAlgException + If array is singular. + + """) + +def test_raises(): + assert_equal(len(doc5['Raises']), 1) + name,_,desc = doc5['Raises'][0] + assert_equal(name,'LinAlgException') + assert_equal(desc,['If array is singular.']) + +def test_see_also(): + doc6 = NumpyDocString( + """ + z(x,theta) + + See Also + -------- + func_a, func_b, func_c + func_d : some equivalent func + foo.func_e : some other func over + multiple lines + func_f, func_g, :meth:`func_h`, func_j, + func_k + :obj:`baz.obj_q` + :class:`class_j`: fubar + foobar + """) + + assert len(doc6['See Also']) == 12 + for func, desc, role in doc6['See Also']: + if func in ('func_a', 'func_b', 'func_c', 'func_f', + 'func_g', 'func_h', 'func_j', 'func_k', 'baz.obj_q'): + assert(not desc) + else: + assert(desc) + + if func == 'func_h': + assert role == 'meth' + elif func == 'baz.obj_q': + assert role == 'obj' + elif func == 'class_j': + assert role == 'class' + else: + assert role is None + + if func == 'func_d': + assert desc == ['some equivalent func'] + elif func == 'foo.func_e': + assert desc == ['some other func over', 'multiple lines'] + elif func == 'class_j': + assert desc == ['fubar', 'foobar'] + +def test_see_also_print(): + class Dummy(object): + """ + See Also + -------- + func_a, func_b + func_c : some relationship + goes here + func_d + """ + pass + + obj = Dummy() + s = str(FunctionDoc(obj, role='func')) + assert(':func:`func_a`, :func:`func_b`' in s) + assert(' some relationship' in s) + assert(':func:`func_d`' in s) + +doc7 = NumpyDocString(""" + + Doc starts on second line. + + """) + +def test_empty_first_line(): + assert doc7['Summary'][0].startswith('Doc starts') + + +def test_no_summary(): + str(SphinxDocString(""" + Parameters + ----------""")) + + +def test_unicode(): + doc = SphinxDocString(""" + öäöäöäöäöåååå + + öäöäöäööäååå + + Parameters + ---------- + ååå : äää + ööö + + Returns + ------- + ååå : ööö + äää + + """) + assert doc['Summary'][0] == u'öäöäöäöäöåååå'.encode('utf-8') + +def test_plot_examples(): + cfg = dict(use_plots=True) + + doc = SphinxDocString(""" + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> plt.plot([1,2,3],[4,5,6]) + >>> plt.show() + """, config=cfg) + assert 'plot::' in str(doc), str(doc) + + doc = SphinxDocString(""" + Examples + -------- + .. 
plot:: + + import matplotlib.pyplot as plt + plt.plot([1,2,3],[4,5,6]) + plt.show() + """, config=cfg) + assert str(doc).count('plot::') == 1, str(doc) + +def test_class_members(): + + class Dummy(object): + """ + Dummy class. + + """ + def spam(self, a, b): + """Spam\n\nSpam spam.""" + pass + def ham(self, c, d): + """Cheese\n\nNo cheese.""" + pass + + for cls in (ClassDoc, SphinxClassDoc): + doc = cls(Dummy, config=dict(show_class_members=False)) + assert 'Methods' not in str(doc), (cls, str(doc)) + assert 'spam' not in str(doc), (cls, str(doc)) + assert 'ham' not in str(doc), (cls, str(doc)) + + doc = cls(Dummy, config=dict(show_class_members=True)) + assert 'Methods' in str(doc), (cls, str(doc)) + assert 'spam' in str(doc), (cls, str(doc)) + assert 'ham' in str(doc), (cls, str(doc)) + + if cls is SphinxClassDoc: + assert '.. autosummary::' in str(doc), str(doc) diff --git a/doc/sphinxext/traitsdoc.py b/doc/sphinxext/traitsdoc.py new file mode 100755 index 00000000..0fcf2c1c --- /dev/null +++ b/doc/sphinxext/traitsdoc.py @@ -0,0 +1,140 @@ +""" +========= +traitsdoc +========= + +Sphinx extension that handles docstrings in the Numpy standard format, [1] +and support Traits [2]. + +This extension can be used as a replacement for ``numpydoc`` when support +for Traits is required. + +.. [1] http://projects.scipy.org/numpy/wiki/CodingStyleGuidelines#docstring-standard +.. [2] http://code.enthought.com/projects/traits/ + +""" + +import inspect +import os +import pydoc + +import docscrape +import docscrape_sphinx +from docscrape_sphinx import SphinxClassDoc, SphinxFunctionDoc, SphinxDocString + +import numpydoc + +import comment_eater + +class SphinxTraitsDoc(SphinxClassDoc): + def __init__(self, cls, modulename='', func_doc=SphinxFunctionDoc): + if not inspect.isclass(cls): + raise ValueError("Initialise using a class. Got %r" % cls) + self._cls = cls + + if modulename and not modulename.endswith('.'): + modulename += '.' 
+ self._mod = modulename + self._name = cls.__name__ + self._func_doc = func_doc + + docstring = pydoc.getdoc(cls) + docstring = docstring.split('\n') + + # De-indent paragraph + try: + indent = min(len(s) - len(s.lstrip()) for s in docstring + if s.strip()) + except ValueError: + indent = 0 + + for n,line in enumerate(docstring): + docstring[n] = docstring[n][indent:] + + self._doc = docscrape.Reader(docstring) + self._parsed_data = { + 'Signature': '', + 'Summary': '', + 'Description': [], + 'Extended Summary': [], + 'Parameters': [], + 'Returns': [], + 'Raises': [], + 'Warns': [], + 'Other Parameters': [], + 'Traits': [], + 'Methods': [], + 'See Also': [], + 'Notes': [], + 'References': '', + 'Example': '', + 'Examples': '', + 'index': {} + } + + self._parse() + + def _str_summary(self): + return self['Summary'] + [''] + + def _str_extended_summary(self): + return self['Description'] + self['Extended Summary'] + [''] + + def __str__(self, indent=0, func_role="func"): + out = [] + out += self._str_signature() + out += self._str_index() + [''] + out += self._str_summary() + out += self._str_extended_summary() + for param_list in ('Parameters', 'Traits', 'Methods', + 'Returns','Raises'): + out += self._str_param_list(param_list) + out += self._str_see_also("obj") + out += self._str_section('Notes') + out += self._str_references() + out += self._str_section('Example') + out += self._str_section('Examples') + out = self._str_indent(out,indent) + return '\n'.join(out) + +def looks_like_issubclass(obj, classname): + """ Return True if the object has a class or superclass with the given class + name. + + Ignores old-style classes. + """ + t = obj + if t.__name__ == classname: + return True + for klass in t.__mro__: + if klass.__name__ == classname: + return True + return False + +def get_doc_object(obj, what=None, config=None): + if what is None: + if inspect.isclass(obj): + what = 'class' + elif inspect.ismodule(obj): + what = 'module' + elif callable(obj): + what = 'function' + else: + what = 'object' + if what == 'class': + doc = SphinxTraitsDoc(obj, '', func_doc=SphinxFunctionDoc, config=config) + if looks_like_issubclass(obj, 'HasTraits'): + for name, trait, comment in comment_eater.get_class_traits(obj): + # Exclude private traits. 
+ if not name.startswith('_'): + doc['Traits'].append((name, trait, comment.splitlines())) + return doc + elif what in ('function', 'method'): + return SphinxFunctionDoc(obj, '', config=config) + else: + return SphinxDocString(pydoc.getdoc(obj), config=config) + +def setup(app): + # init numpydoc + numpydoc.setup(app, get_doc_object) + diff --git a/examples/data/SOURCES b/examples/data/SOURCES new file mode 100644 index 00000000..e69de29b diff --git a/examples/finance.py b/examples/finance.py new file mode 100644 index 00000000..639a80af --- /dev/null +++ b/examples/finance.py @@ -0,0 +1,83 @@ +""" +Some examples playing around with yahoo finance data +""" + +from datetime import datetime + +import matplotlib.finance as fin +import numpy as np +from pylab import show + + +from pandas import Index, DataFrame +from pandas.core.datetools import BMonthEnd +from pandas import ols + +startDate = datetime(2008, 1, 1) +endDate = datetime(2009, 9, 1) + +def getQuotes(symbol, start, end): + quotes = fin.quotes_historical_yahoo(symbol, start, end) + dates, open, close, high, low, volume = zip(*quotes) + + data = { + 'open' : open, + 'close' : close, + 'high' : high, + 'low' : low, + 'volume' : volume + } + + dates = Index([datetime.fromordinal(int(d)) for d in dates]) + return DataFrame(data, index=dates) + +msft = getQuotes('MSFT', startDate, endDate) +aapl = getQuotes('AAPL', startDate, endDate) +goog = getQuotes('GOOG', startDate, endDate) +ibm = getQuotes('IBM', startDate, endDate) + +px = DataFrame({'MSFT' : msft['close'], + 'IBM' : ibm['close'], + 'GOOG' : goog['close'], + 'AAPL' : aapl['close']}) +returns = px / px.shift(1) - 1 + +# Select dates + +subIndex = ibm.index[(ibm['close'] > 95) & (ibm['close'] < 100)] +msftOnSameDates = msft.reindex(subIndex) + +# Insert columns + +msft['hi-lo spread'] = msft['high'] - msft['low'] +ibm['hi-lo spread'] = ibm['high'] - ibm['low'] + +# Aggregate monthly + +def toMonthly(frame, how): + offset = BMonthEnd() + + return frame.groupby(offset.rollforward).aggregate(how) + +msftMonthly = toMonthly(msft, np.mean) +ibmMonthly = toMonthly(ibm, np.mean) + +# Statistics + +stdev = DataFrame({ + 'MSFT' : msft.std(), + 'IBM' : ibm.std() +}) + +# Arithmetic + +ratios = ibm / msft + +# Works with different indices + +ratio = ibm / ibmMonthly +monthlyRatio = ratio.reindex(ibmMonthly.index) + +# Ratio relative to past month average + +filledRatio = ibm / ibmMonthly.reindex(ibm.index, method='pad') diff --git a/examples/regressions.py b/examples/regressions.py new file mode 100644 index 00000000..e78ff90a --- /dev/null +++ b/examples/regressions.py @@ -0,0 +1,49 @@ +from datetime import datetime +import string + +import numpy as np + +from pandas.core.api import Series, DataFrame, DateRange +from pandas.stats.api import ols + +N = 100 + +start = datetime(2009, 9, 2) +dateRange = DateRange(start, periods=N) + +def makeDataFrame(): + data = DataFrame(np.random.randn(N, 7), + columns=list(string.ascii_uppercase[:7]), + index=dateRange) + + return data + +def makeSeries(): + return Series(np.random.randn(N), index=dateRange) + +#------------------------------------------------------------------------------- +# Standard rolling linear regression + +X = makeDataFrame() +Y = makeSeries() + +model = ols(y=Y, x=X) + +print model + +#------------------------------------------------------------------------------- +# Panel regression + +data = { + 'A' : makeDataFrame(), + 'B' : makeDataFrame(), + 'C' : makeDataFrame() +} + +Y = makeDataFrame() + +panelModel = ols(y=Y, x=data, 
window=50) + +model = ols(y=Y, x=data) + +print panelModel diff --git a/ez_setup.py b/ez_setup.py new file mode 100644 index 00000000..1ff1d3e7 --- /dev/null +++ b/ez_setup.py @@ -0,0 +1,284 @@ +#!python +"""Bootstrap setuptools installation + +If you want to use setuptools in your package's setup.py, just include this +file in the same directory with it, and add this to the top of your setup.py:: + + from ez_setup import use_setuptools + use_setuptools() + +If you want to require a specific version of setuptools, set a download +mirror, or use an alternate download directory, you can do so by supplying +the appropriate options to ``use_setuptools()``. + +This file can also be run as a script to install or upgrade setuptools. +""" +import sys +DEFAULT_VERSION = "0.6c11" +DEFAULT_URL = "http://pypi.python.org/packages/%s/s/setuptools/" % sys.version[:3] + +md5_data = { + 'setuptools-0.6b1-py2.3.egg': '8822caf901250d848b996b7f25c6e6ca', + 'setuptools-0.6b1-py2.4.egg': 'b79a8a403e4502fbb85ee3f1941735cb', + 'setuptools-0.6b2-py2.3.egg': '5657759d8a6d8fc44070a9d07272d99b', + 'setuptools-0.6b2-py2.4.egg': '4996a8d169d2be661fa32a6e52e4f82a', + 'setuptools-0.6b3-py2.3.egg': 'bb31c0fc7399a63579975cad9f5a0618', + 'setuptools-0.6b3-py2.4.egg': '38a8c6b3d6ecd22247f179f7da669fac', + 'setuptools-0.6b4-py2.3.egg': '62045a24ed4e1ebc77fe039aa4e6f7e5', + 'setuptools-0.6b4-py2.4.egg': '4cb2a185d228dacffb2d17f103b3b1c4', + 'setuptools-0.6c1-py2.3.egg': 'b3f2b5539d65cb7f74ad79127f1a908c', + 'setuptools-0.6c1-py2.4.egg': 'b45adeda0667d2d2ffe14009364f2a4b', + 'setuptools-0.6c10-py2.3.egg': 'ce1e2ab5d3a0256456d9fc13800a7090', + 'setuptools-0.6c10-py2.4.egg': '57d6d9d6e9b80772c59a53a8433a5dd4', + 'setuptools-0.6c10-py2.5.egg': 'de46ac8b1c97c895572e5e8596aeb8c7', + 'setuptools-0.6c10-py2.6.egg': '58ea40aef06da02ce641495523a0b7f5', + 'setuptools-0.6c11-py2.3.egg': '2baeac6e13d414a9d28e7ba5b5a596de', + 'setuptools-0.6c11-py2.4.egg': 'bd639f9b0eac4c42497034dec2ec0c2b', + 'setuptools-0.6c11-py2.5.egg': '64c94f3bf7a72a13ec83e0b24f2749b2', + 'setuptools-0.6c11-py2.6.egg': 'bfa92100bd772d5a213eedd356d64086', + 'setuptools-0.6c2-py2.3.egg': 'f0064bf6aa2b7d0f3ba0b43f20817c27', + 'setuptools-0.6c2-py2.4.egg': '616192eec35f47e8ea16cd6a122b7277', + 'setuptools-0.6c3-py2.3.egg': 'f181fa125dfe85a259c9cd6f1d7b78fa', + 'setuptools-0.6c3-py2.4.egg': 'e0ed74682c998bfb73bf803a50e7b71e', + 'setuptools-0.6c3-py2.5.egg': 'abef16fdd61955514841c7c6bd98965e', + 'setuptools-0.6c4-py2.3.egg': 'b0b9131acab32022bfac7f44c5d7971f', + 'setuptools-0.6c4-py2.4.egg': '2a1f9656d4fbf3c97bf946c0a124e6e2', + 'setuptools-0.6c4-py2.5.egg': '8f5a052e32cdb9c72bcf4b5526f28afc', + 'setuptools-0.6c5-py2.3.egg': 'ee9fd80965da04f2f3e6b3576e9d8167', + 'setuptools-0.6c5-py2.4.egg': 'afe2adf1c01701ee841761f5bcd8aa64', + 'setuptools-0.6c5-py2.5.egg': 'a8d3f61494ccaa8714dfed37bccd3d5d', + 'setuptools-0.6c6-py2.3.egg': '35686b78116a668847237b69d549ec20', + 'setuptools-0.6c6-py2.4.egg': '3c56af57be3225019260a644430065ab', + 'setuptools-0.6c6-py2.5.egg': 'b2f8a7520709a5b34f80946de5f02f53', + 'setuptools-0.6c7-py2.3.egg': '209fdf9adc3a615e5115b725658e13e2', + 'setuptools-0.6c7-py2.4.egg': '5a8f954807d46a0fb67cf1f26c55a82e', + 'setuptools-0.6c7-py2.5.egg': '45d2ad28f9750e7434111fde831e8372', + 'setuptools-0.6c8-py2.3.egg': '50759d29b349db8cfd807ba8303f1902', + 'setuptools-0.6c8-py2.4.egg': 'cba38d74f7d483c06e9daa6070cce6de', + 'setuptools-0.6c8-py2.5.egg': '1721747ee329dc150590a58b3e1ac95b', + 'setuptools-0.6c9-py2.3.egg': 'a83c4020414807b496e4cfbe08507c03', + 
'setuptools-0.6c9-py2.4.egg': '260a2be2e5388d66bdaee06abec6342a', + 'setuptools-0.6c9-py2.5.egg': 'fe67c3e5a17b12c0e7c541b7ea43a8e6', + 'setuptools-0.6c9-py2.6.egg': 'ca37b1ff16fa2ede6e19383e7b59245a', +} + +import sys, os +try: from hashlib import md5 +except ImportError: from md5 import md5 + +def _validate_md5(egg_name, data): + if egg_name in md5_data: + digest = md5(data).hexdigest() + if digest != md5_data[egg_name]: + print >>sys.stderr, ( + "md5 validation of %s failed! (Possible download problem?)" + % egg_name + ) + sys.exit(2) + return data + +def use_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + download_delay=15 +): + """Automatically find/download setuptools and make it available on sys.path + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end with + a '/'). `to_dir` is the directory where setuptools will be downloaded, if + it is not already available. If `download_delay` is specified, it should + be the number of seconds that will be paused before initiating a download, + should one be required. If an older version of setuptools is installed, + this routine will print a message to ``sys.stderr`` and raise SystemExit in + an attempt to abort the calling script. + """ + was_imported = 'pkg_resources' in sys.modules or 'setuptools' in sys.modules + def do_download(): + egg = download_setuptools(version, download_base, to_dir, download_delay) + sys.path.insert(0, egg) + import setuptools; setuptools.bootstrap_install_from = egg + try: + import pkg_resources + except ImportError: + return do_download() + try: + pkg_resources.require("setuptools>="+version); return + except pkg_resources.VersionConflict, e: + if was_imported: + print >>sys.stderr, ( + "The required version of setuptools (>=%s) is not available, and\n" + "can't be installed while this script is running. Please install\n" + " a more recent version first, using 'easy_install -U setuptools'." + "\n\n(Currently using %r)" + ) % (version, e.args[0]) + sys.exit(2) + else: + del pkg_resources, sys.modules['pkg_resources'] # reload ok + return do_download() + except pkg_resources.DistributionNotFound: + return do_download() + +def download_setuptools( + version=DEFAULT_VERSION, download_base=DEFAULT_URL, to_dir=os.curdir, + delay = 15 +): + """Download setuptools from a specified location and return its filename + + `version` should be a valid setuptools version number that is available + as an egg for download under the `download_base` URL (which should end + with a '/'). `to_dir` is the directory where the egg will be downloaded. + `delay` is the number of seconds to pause before an actual download attempt. + """ + import urllib2, shutil + egg_name = "setuptools-%s-py%s.egg" % (version,sys.version[:3]) + url = download_base + egg_name + saveto = os.path.join(to_dir, egg_name) + src = dst = None + if not os.path.exists(saveto): # Avoid repeated downloads + try: + from distutils import log + if delay: + log.warn(""" +--------------------------------------------------------------------------- +This script requires setuptools version %s to run (even to display +help). I will attempt to download it for you (from +%s), but +you may need to enable firewall access for this script first. +I will start the download in %d seconds. + +(Note: if this machine does not have network access, please obtain the file + + %s + +and place it in this directory before rerunning this script.) 
+---------------------------------------------------------------------------""", + version, download_base, delay, url + ); from time import sleep; sleep(delay) + log.warn("Downloading %s", url) + src = urllib2.urlopen(url) + # Read/write all in one block, so we don't create a corrupt file + # if the download is interrupted. + data = _validate_md5(egg_name, src.read()) + dst = open(saveto,"wb"); dst.write(data) + finally: + if src: src.close() + if dst: dst.close() + return os.path.realpath(saveto) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +def main(argv, version=DEFAULT_VERSION): + """Install or upgrade setuptools and EasyInstall""" + try: + import setuptools + except ImportError: + egg = None + try: + egg = download_setuptools(version, delay=0) + sys.path.insert(0,egg) + from setuptools.command.easy_install import main + return main(list(argv)+[egg]) # we're done here + finally: + if egg and os.path.exists(egg): + os.unlink(egg) + else: + if setuptools.__version__ == '0.0.1': + print >>sys.stderr, ( + "You have an obsolete version of setuptools installed. Please\n" + "remove it from your system entirely before rerunning this script." + ) + sys.exit(2) + + req = "setuptools>="+version + import pkg_resources + try: + pkg_resources.require(req) + except pkg_resources.VersionConflict: + try: + from setuptools.command.easy_install import main + except ImportError: + from easy_install import main + main(list(argv)+[download_setuptools(delay=0)]) + sys.exit(0) # try to force an exit + else: + if argv: + from setuptools.command.easy_install import main + main(argv) + else: + print "Setuptools version",version,"or greater has been installed." + print '(Run "ez_setup.py -U setuptools" to reinstall or upgrade.)' + +def update_md5(filenames): + """Update our built-in md5 registry""" + + import re + + for name in filenames: + base = os.path.basename(name) + f = open(name,'rb') + md5_data[base] = md5(f.read()).hexdigest() + f.close() + + data = [" %r: %r,\n" % it for it in md5_data.items()] + data.sort() + repl = "".join(data) + + import inspect + srcfile = inspect.getsourcefile(sys.modules[__name__]) + f = open(srcfile, 'rb'); src = f.read(); f.close() + + match = re.search("\nmd5_data = {\n([^}]+)}", src) + if not match: + print >>sys.stderr, "Internal error!" + sys.exit(2) + + src = src[:match.start(1)] + repl + src[match.end(1):] + f = open(srcfile,'w') + f.write(src) + f.close() + + +if __name__=='__main__': + if len(sys.argv)>2 and sys.argv[1]=='--md5update': + update_md5(sys.argv[2:]) + else: + main(sys.argv[1:]) + + + + + + diff --git a/fake_pyrex/Pyrex/Distutils/__init__.py b/fake_pyrex/Pyrex/Distutils/__init__.py new file mode 100644 index 00000000..51c8e16b --- /dev/null +++ b/fake_pyrex/Pyrex/Distutils/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/fake_pyrex/Pyrex/Distutils/build_ext.py b/fake_pyrex/Pyrex/Distutils/build_ext.py new file mode 100644 index 00000000..4f846f62 --- /dev/null +++ b/fake_pyrex/Pyrex/Distutils/build_ext.py @@ -0,0 +1 @@ +build_ext = "yes, it's there!" 
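The fake_pyrex stub package added above exists because older setuptools monkey patches distutils' build_ext and probes for a Pyrex installation; providing a dummy Pyrex.Distutils.build_ext satisfies that probe so the pre-generated C sources shipped in the tarball are compiled as-is. The sketch below shows, under the assumption that the project's setup.py prepends the stub directory to sys.path, how such a hook is typically wired up — only the fake_pyrex directory name is taken from this patch; everything else in the snippet is illustrative and not part of the import:

# Hypothetical setup.py excerpt (not part of this patch): put the Pyrex stub
# ahead of any real Pyrex so setuptools' Pyrex probe succeeds and no .pyx
# regeneration is attempted.
import os
import sys

_stub = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'fake_pyrex')
if _stub not in sys.path:
    sys.path.insert(0, _stub)  # "import Pyrex" now resolves to the stub package

import setuptools  # imported after the path tweak so its monkey patching sees the stub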
diff --git a/fake_pyrex/Pyrex/__init__.py b/fake_pyrex/Pyrex/__init__.py new file mode 100644 index 00000000..51c8e16b --- /dev/null +++ b/fake_pyrex/Pyrex/__init__.py @@ -0,0 +1 @@ +# work around broken setuptools monkey patching diff --git a/pandas/__init__.py b/pandas/__init__.py new file mode 100644 index 00000000..11a0b427 --- /dev/null +++ b/pandas/__init__.py @@ -0,0 +1,39 @@ +# pylint: disable-msg=W0614,W0401,W0611,W0622 + +__docformat__ = 'restructuredtext' + +from datetime import datetime + +import numpy as np + +try: + import pandas.lib as lib +except Exception: # pragma: no cover + import sys + e = sys.exc_info()[1] # Py25 and Py3 current exception syntax conflict + if 'No module named' in str(e): + raise ImportError('C extensions not built: if you installed already ' + 'verify that you are not importing from the source ' + 'directory') + else: + raise + +from pandas.version import version as __version__ +from pandas.info import __doc__ + +from pandas.core.api import * +from pandas.sparse.api import * +from pandas.stats.api import * +from pandas.tseries.api import * + +from pandas.io.parsers import (read_csv, read_table, read_clipboard, + read_fwf, to_clipboard, ExcelFile, + ExcelWriter) +from pandas.io.pytables import HDFStore +from pandas.util.testing import debug + +from pandas.tools.describe import value_range +from pandas.tools.merge import merge, concat, ordered_merge +from pandas.tools.pivot import pivot_table, crosstab +from pandas.tools.plotting import scatter_matrix +from pandas.tools.tile import cut, qcut diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/compat/scipy.py b/pandas/compat/scipy.py new file mode 100644 index 00000000..9f021a01 --- /dev/null +++ b/pandas/compat/scipy.py @@ -0,0 +1,242 @@ +""" +Shipping functions from SciPy to reduce dependency on having SciPy installed +""" + +import numpy as np + + +def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'): + """ + Calculate the score at the given `per` percentile of the sequence `a`. + + For example, the score at `per=50` is the median. If the desired quantile + lies between two data points, we interpolate between them, according to + the value of `interpolation`. If the parameter `limit` is provided, it + should be a tuple (lower, upper) of two values. Values of `a` outside + this (closed) interval will be ignored. + + The `interpolation_method` parameter supports three values, namely + `fraction` (default), `lower` and `higher`. Interpolation is done only, + if the desired quantile lies between two data points `i` and `j`. For + `fraction`, the result is an interpolated value between `i` and `j`; + for `lower`, the result is `i`, for `higher` the result is `j`. + + Parameters + ---------- + a : ndarray + Values from which to extract score. + per : scalar + Percentile at which to extract score. + limit : tuple, optional + Tuple of two scalars, the lower and upper limits within which to + compute the percentile. + interpolation : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + - fraction: `i + (j - i)*fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + -lower: `i`. + - higher: `j`. + + Returns + ------- + score : float + Score at percentile. 
+ + See Also + -------- + percentileofscore + + Examples + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + # TODO: this should be a simple wrapper around a well-written quantile + # function. GNU R provides 9 quantile algorithms (!), with differing + # behaviour at, for example, discontinuities. + values = np.sort(a, axis=0) + if limit: + values = values[(limit[0] <= values) & (values <= limit[1])] + + idx = per /100. * (values.shape[0] - 1) + if (idx % 1 == 0): + score = values[idx] + else: + if interpolation_method == 'fraction': + score = _interpolate(values[int(idx)], values[int(idx) + 1], + idx % 1) + elif interpolation_method == 'lower': + score = values[np.floor(idx)] + elif interpolation_method == 'higher': + score = values[np.ceil(idx)] + else: + raise ValueError("interpolation_method can only be 'fraction', " \ + "'lower' or 'higher'") + + return score + + +def _interpolate(a, b, fraction): + """Returns the point at the given fraction between a and b, where + 'fraction' must be between 0 and 1. + """ + return a + (b - a)*fraction + + +def rankdata(a): + """ + Ranks the data, dealing with ties appropriately. + + Equal values are assigned a rank that is the average of the ranks that + would have been otherwise assigned to all of the values within that set. + Ranks begin at 1, not 0. + + Parameters + ---------- + a : array_like + This array is first flattened. + + Returns + ------- + rankdata : ndarray + An array of length equal to the size of `a`, containing rank scores. + + Examples + -------- + >>> stats.rankdata([0, 2, 2, 3]) + array([ 1. , 2.5, 2.5, 4. ]) + + """ + a = np.ravel(a) + n = len(a) + svec, ivec = fastsort(a) + sumranks = 0 + dupcount = 0 + newarray = np.zeros(n, float) + for i in xrange(n): + sumranks += i + dupcount += 1 + if i==n-1 or svec[i] != svec[i+1]: + averank = sumranks / float(dupcount) + 1 + for j in xrange(i-dupcount+1,i+1): + newarray[ivec[j]] = averank + sumranks = 0 + dupcount = 0 + return newarray + + +def fastsort(a): + """ + Sort an array and provide the argsort. + + Parameters + ---------- + a : array_like + Input array. + + Returns + ------- + fastsort : ndarray of type int + sorted indices into the original array + + """ + # TODO: the wording in the docstring is nonsense. + it = np.argsort(a) + as_ = a[it] + return as_, it + + +def percentileofscore(a, score, kind='rank'): + ''' + The percentile rank of a score relative to a list of scores. + + A `percentileofscore` of, for example, 80% means that 80% of the + scores in `a` are below the given score. In the case of gaps or + ties, the exact definition depends on the optional keyword, `kind`. + + Parameters + ---------- + a: array like + Array of scores to which `score` is compared. + score: int or float + Score that is compared to the elements in `a`. + kind: {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. 
See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + pcos : float + Percentile-position of score (0-100) relative to `a`. + + Examples + -------- + Three-quarters of the given values lie below a given score: + + >>> percentileofscore([1, 2, 3, 4], 3) + 75.0 + + With multiple matches, note how the scores of the two matches, 0.6 + and 0.8 respectively, are averaged: + + >>> percentileofscore([1, 2, 3, 3, 4], 3) + 70.0 + + Only 2/5 values are strictly less than 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict') + 40.0 + + But 4/5 values are less than or equal to 3: + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak') + 80.0 + + The average between the weak and the strict scores is + + >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean') + 60.0 + + ''' + a = np.array(a) + n = len(a) + + if kind == 'rank': + if not(np.any(a == score)): + a = np.append(a, score) + a_len = np.array(range(len(a))) + else: + a_len = np.array(range(len(a))) + 1.0 + + a = np.sort(a) + idx = [a == score] + pct = (np.mean(a_len[idx]) / n) * 100.0 + return pct + + elif kind == 'strict': + return sum(a < score) / float(n) * 100 + elif kind == 'weak': + return sum(a <= score) / float(n) * 100 + elif kind == 'mean': + return (sum(a < score) + sum(a <= score)) * 50 / float(n) + else: + raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'") diff --git a/pandas/core/__init__.py b/pandas/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py new file mode 100644 index 00000000..2334c47e --- /dev/null +++ b/pandas/core/algorithms.py @@ -0,0 +1,320 @@ +""" +Generic data algorithms. This module is experimental at the moment and not +intended for public consumption +""" + +import numpy as np + +import pandas.core.common as com +import pandas.lib as lib +import pandas._algos as _algos + +def match(to_match, values, na_sentinel=-1): + """ + Compute locations of to_match into values + + Parameters + ---------- + to_match : array-like + values to find positions of + values : array-like + Unique set of values + na_sentinel : int, default -1 + Value to mark "not found" + + Examples + -------- + + Returns + ------- + match : ndarray of integers + """ + values = np.asarray(values) + if issubclass(values.dtype.type, basestring): + values = np.array(values, dtype='O') + + f = lambda htype, caster: _match_generic(to_match, values, htype, caster) + return _hashtable_algo(f, values.dtype) + +def unique(values): + """ + Compute unique values (not necessarily sorted) efficiently from input array + of values + + Parameters + ---------- + values : array-like + + Returns + ------- + uniques + """ + values = com._asarray_tuplesafe(values) + f = lambda htype, caster: _unique_generic(values, htype, caster) + return _hashtable_algo(f, values.dtype) + + +def count(values, uniques=None): + f = lambda htype, caster: _count_generic(values, htype, caster) + + if uniques is not None: + raise NotImplementedError + else: + return _hashtable_algo(f, values.dtype) + +def _hashtable_algo(f, dtype): + """ + f(HashTable, type_caster) -> result + """ + if com.is_float_dtype(dtype): + return f(lib.Float64HashTable, com._ensure_float64) + elif com.is_integer_dtype(dtype): + return f(lib.Int64HashTable, com._ensure_int64) + else: + return f(lib.PyObjectHashTable, com._ensure_object) + + +def _count_generic(values, table_type, type_caster): + from pandas.core.series import Series + + values = type_caster(values) + table = 
table_type(min(len(values), 1000000)) + uniques, labels, counts = table.factorize(values) + + return Series(counts, index=uniques) + +def _match_generic(values, index, table_type, type_caster): + values = type_caster(values) + index = type_caster(index) + table = table_type(min(len(index), 1000000)) + table.map_locations(index) + return table.lookup(values) + +def _unique_generic(values, table_type, type_caster): + values = type_caster(values) + table = table_type(min(len(values), 1000000)) + uniques = table.unique(values) + return type_caster(uniques) + + +def factorize(values, sort=False, order=None, na_sentinel=-1): + """ + Encode input values as an enumerated type or categorical variable + + Parameters + ---------- + values : sequence + sort : + order : + + Returns + ------- + """ + values = np.asarray(values) + is_datetime = com.is_datetime64_dtype(values) + hash_klass, values = _get_data_algo(values, _hashtables) + + uniques = [] + table = hash_klass(len(values)) + labels, counts = table.get_labels(values, uniques, 0, na_sentinel) + + labels = com._ensure_platform_int(labels) + + uniques = com._asarray_tuplesafe(uniques) + if sort and len(counts) > 0: + sorter = uniques.argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + labels = reverse_indexer.take(labels) + np.putmask(labels, mask, -1) + + uniques = uniques.take(sorter) + counts = counts.take(sorter) + + if is_datetime: + uniques = np.array(uniques, dtype='M8[ns]') + + return labels, uniques, counts + +def value_counts(values, sort=True, ascending=False): + """ + Compute a histogram of the counts of non-null values + + Parameters + ---------- + values : ndarray (1-d) + sort : boolean, default True + Sort by values + ascending : boolean, default False + Sort in ascending order + + Returns + ------- + value_counts : Series + """ + from pandas.core.series import Series + from collections import defaultdict + + values = np.asarray(values) + + if com.is_integer_dtype(values.dtype): + values = com._ensure_int64(values) + keys, counts = lib.value_count_int64(values) + result = Series(counts, index=keys) + else: + counter = defaultdict(lambda: 0) + values = values[com.notnull(values)] + for value in values: + counter[value] += 1 + result = Series(counter) + + if sort: + result.sort() + if not ascending: + result = result[::-1] + + return result + + +def rank(values, axis=0, method='average', na_option='keep', + ascending=True): + """ + + """ + if values.ndim == 1: + f, values = _get_data_algo(values, _rank1d_functions) + ranks = f(values, ties_method=method, ascending=ascending) + elif values.ndim == 2: + f, values = _get_data_algo(values, _rank2d_functions) + ranks = f(values, axis=axis, ties_method=method, + ascending=ascending) + return ranks + +def quantile(x, q, interpolation_method='fraction'): + """ + Compute sample quantile or quantiles of the input array. For example, q=0.5 + computes the median. + + The `interpolation_method` parameter supports three values, namely + `fraction` (default), `lower` and `higher`. Interpolation is done only, + if the desired quantile lies between two data points `i` and `j`. For + `fraction`, the result is an interpolated value between `i` and `j`; + for `lower`, the result is `i`, for `higher` the result is `j`. + + Parameters + ---------- + a : ndarray + Values from which to extract score. + q : scalar or array + Percentile at which to extract score. 
+ interpolation : {'fraction', 'lower', 'higher'}, optional + This optional parameter specifies the interpolation method to use, + when the desired quantile lies between two data points `i` and `j`: + + - fraction: `i + (j - i)*fraction`, where `fraction` is the + fractional part of the index surrounded by `i` and `j`. + -lower: `i`. + - higher: `j`. + + Returns + ------- + score : float + Score at percentile. + + Examplesb + -------- + >>> from scipy import stats + >>> a = np.arange(100) + >>> stats.scoreatpercentile(a, 50) + 49.5 + + """ + x = np.asarray(x) + mask = com.isnull(x) + + x = x[-mask] + + values = np.sort(x) + + def _get_score(at): + if len(values) == 0: + return np.nan + + idx = at * (len(values) - 1) + if (idx % 1 == 0): + score = values[idx] + else: + if interpolation_method == 'fraction': + score = _interpolate(values[int(idx)], values[int(idx) + 1], + idx % 1) + elif interpolation_method == 'lower': + score = values[np.floor(idx)] + elif interpolation_method == 'higher': + score = values[np.ceil(idx)] + else: + raise ValueError("interpolation_method can only be 'fraction', " \ + "'lower' or 'higher'") + + return score + + if np.isscalar(q): + return _get_score(q) + else: + q = np.asarray(q, np.float64) + return _algos.arrmap_float64(q, _get_score) + +def _interpolate(a, b, fraction): + """Returns the point at the given fraction between a and b, where + 'fraction' must be between 0 and 1. + """ + return a + (b - a)*fraction + + +def _get_data_algo(values, func_map): + if com.is_float_dtype(values): + f = func_map['float64'] + values = com._ensure_float64(values) + elif com.is_datetime64_dtype(values): + f = func_map['int64'] + values = values.view('i8') + elif com.is_integer_dtype(values): + f = func_map['int64'] + values = com._ensure_int64(values) + else: + f = func_map['generic'] + values = com._ensure_object(values) + return f, values + +def group_position(*args): + """ + Get group position + """ + from collections import defaultdict + table = defaultdict(int) + + result = [] + for tup in zip(*args): + result.append(table[tup]) + table[tup] += 1 + + return result + + +_rank1d_functions = { + 'float64' : lib.rank_1d_float64, + 'int64' : lib.rank_1d_int64, + 'generic' : lib.rank_1d_generic +} + +_rank2d_functions = { + 'float64' : lib.rank_2d_float64, + 'generic' : lib.rank_2d_generic +} + +_hashtables = { + 'float64' : lib.Float64HashTable, + 'int64' : lib.Int64HashTable, + 'generic' : lib.PyObjectHashTable +} diff --git a/pandas/core/api.py b/pandas/core/api.py new file mode 100644 index 00000000..8cf3b7f4 --- /dev/null +++ b/pandas/core/api.py @@ -0,0 +1,31 @@ + +# pylint: disable=W0614,W0401,W0611 + +import numpy as np + +from pandas.core.algorithms import factorize, match, unique, value_counts + +from pandas.core.common import isnull, notnull, save, load +from pandas.core.categorical import Categorical, Factor +from pandas.core.format import (set_printoptions, reset_printoptions, + set_eng_float_format) +from pandas.core.index import Index, Int64Index, MultiIndex + +from pandas.core.series import Series, TimeSeries +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.core.groupby import groupby +from pandas.core.reshape import (pivot_simple as pivot, get_dummies, + lreshape) + +WidePanel = Panel + +from pandas.tseries.offsets import DateOffset +from pandas.tseries.tools import to_datetime +from pandas.tseries.index import (DatetimeIndex, Timestamp, + date_range, bdate_range) +from pandas.tseries.period import Period, 
PeriodIndex + +# legacy +from pandas.core.daterange import DateRange # deprecated +import pandas.core.datetools as datetools diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py new file mode 100644 index 00000000..0a331310 --- /dev/null +++ b/pandas/core/categorical.py @@ -0,0 +1,127 @@ +# pylint: disable=E1101,W0232 + +import numpy as np + +from pandas.core.algorithms import factorize +import pandas.core.common as com + + +def _cat_compare_op(op): + def f(self, other): + if isinstance(other, (Categorical, np.ndarray)): + values = np.asarray(self) + f = getattr(values, op) + return f(np.asarray(other)) + else: + if other in self.levels: + i = self.levels.get_loc(other) + return getattr(self.labels, op)(i) + else: + return np.repeat(False, len(self)) + + f.__name__ = op + + return f + +class Categorical(object): + """ + Represents a categorical variable in classic R / S-plus fashion + + Parameters + ---------- + labels : ndarray of integers + levels : Index-like (unique) + + data : array-like + + Returns + ------- + **Attributes** + * labels : ndarray + * levels : ndarray + """ + def __init__(self, labels, levels, name=None): + self.labels = labels + self.levels = levels + self.name = name + + @classmethod + def from_array(cls, data): + try: + labels, levels, _ = factorize(data, sort=True) + except TypeError: + labels, levels, _ = factorize(data, sort=False) + + return Categorical(labels, levels, + name=getattr(data, 'name', None)) + + _levels = None + def _set_levels(self, levels): + from pandas.core.index import _ensure_index + + levels = _ensure_index(levels) + if not levels.is_unique: + raise ValueError('Categorical levels must be unique') + self._levels = levels + + def _get_levels(self): + return self._levels + + levels = property(fget=_get_levels, fset=_set_levels) + + __eq__ = _cat_compare_op('__eq__') + __ne__ = _cat_compare_op('__ne__') + __lt__ = _cat_compare_op('__lt__') + __gt__ = _cat_compare_op('__gt__') + __le__ = _cat_compare_op('__le__') + __ge__ = _cat_compare_op('__ge__') + + def __array__(self, dtype=None): + return com.take_1d(self.levels.values, self.labels) + + def __len__(self): + return len(self.labels) + + def __repr__(self): + temp = 'Categorical: %s\n%s\n%s' + values = np.asarray(self) + levheader = 'Levels (%d): ' % len(self.levels) + levstring = np.array_repr(self.levels, + max_line_width=60) + + indent = ' ' * (levstring.find('[') + len(levheader) + 1) + lines = levstring.split('\n') + levstring = '\n'.join([lines[0]] + [indent + x.lstrip() for x in lines[1:]]) + + return temp % ('' if self.name is None else self.name, + repr(values), levheader + levstring) + + def __getitem__(self, key): + if isinstance(key, (int, np.integer)): + i = self.labels[key] + if i == -1: + return np.nan + else: + return self.levels[i] + else: + return Categorical(self.labels[key], self.levels) + + def equals(self, other): + """ + Returns True if categorical arrays are equal + + Parameters + ---------- + other : Categorical + + Returns + ------- + are_equal : boolean + """ + if not isinstance(other, Categorical): + return False + + return (self.levels.equals(other.levels) and + np.array_equal(self.labels, other.labels)) + +Factor = Categorical diff --git a/pandas/core/common.py b/pandas/core/common.py new file mode 100644 index 00000000..4db15c6c --- /dev/null +++ b/pandas/core/common.py @@ -0,0 +1,928 @@ +""" +Misc tools for implementing data structures +""" +try: + import cPickle as pickle +except ImportError: # pragma: no cover + import pickle + +import itertools 
+ +try: + next +except NameError: # pragma: no cover + # Python < 2.6 + def next(x): + return x.next() + +from numpy.lib.format import read_array, write_array +import numpy as np + +import pandas._algos as _algos +import pandas.lib as lib +from pandas.util import py3compat +import codecs +import csv + +from pandas.util.py3compat import StringIO, BytesIO + +# XXX: HACK for NumPy 1.5.1 to suppress warnings +try: + np.seterr(all='ignore') + np.set_printoptions(suppress=True) +except Exception: # pragma: no cover + pass + +class PandasError(Exception): + pass + +class AmbiguousIndexError(PandasError, KeyError): + pass + +def isnull(obj): + ''' + Replacement for numpy.isnan / -numpy.isfinite which is suitable + for use on object arrays. + + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + ''' + if lib.isscalar(obj): + return lib.checknull(obj) + + from pandas.core.generic import PandasObject + if isinstance(obj, np.ndarray): + return _isnull_ndarraylike(obj) + elif isinstance(obj, PandasObject): + # TODO: optimize for DataFrame, etc. + return obj.apply(isnull) + elif hasattr(obj, '__array__'): + return _isnull_ndarraylike(obj) + else: + return obj is None + +def _isnull_ndarraylike(obj): + from pandas import Series + values = np.asarray(obj) + + if values.dtype.kind in ('O', 'S'): + # Working around NumPy ticket 1542 + shape = values.shape + result = np.empty(shape, dtype=bool) + vec = lib.isnullobj(values.ravel()) + result[:] = vec.reshape(shape) + + if isinstance(obj, Series): + result = Series(result, index=obj.index, copy=False) + elif values.dtype == np.dtype('M8[ns]'): + # this is the NaT pattern + result = values.view('i8') == lib.iNaT + else: + result = -np.isfinite(obj) + return result + +def notnull(obj): + ''' + Replacement for numpy.isfinite / -numpy.isnan which is suitable + for use on object arrays. 
+ + Parameters + ---------- + arr: ndarray or object value + + Returns + ------- + boolean ndarray or boolean + ''' + res = isnull(obj) + if np.isscalar(res): + return not res + return -res + +def mask_missing(arr, values_to_mask): + """ + Return a masking array of same size/shape as arr + with entries equaling any member of values_to_mask set to True + """ + if not isinstance(values_to_mask, (list, np.ndarray)): + values_to_mask = [values_to_mask] + + try: + values_to_mask = np.array(values_to_mask, dtype=arr.dtype) + except Exception: + values_to_mask = np.array(values_to_mask, dtype=object) + + na_mask = isnull(values_to_mask) + nonna = values_to_mask[-na_mask] + + mask = None + for x in nonna: + if mask is None: + mask = arr == x + else: + mask = mask | (arr == x) + + if na_mask.any(): + if mask is None: + mask = isnull(arr) + else: + mask = mask | isnull(arr) + + return mask + +def _pickle_array(arr): + arr = arr.view(np.ndarray) + + buf = BytesIO() + write_array(buf, arr) + + return buf.getvalue() + +def _unpickle_array(bytes): + arr = read_array(BytesIO(bytes)) + return arr + +def _view_wrapper(f, wrap_dtype, na_override=None): + def wrapper(arr, indexer, out, fill_value=np.nan): + if na_override is not None and np.isnan(fill_value): + fill_value = na_override + view = arr.view(wrap_dtype) + outview = out.view(wrap_dtype) + f(view, indexer, outview, fill_value=fill_value) + return wrapper + + +_take1d_dict = { + 'float64' : _algos.take_1d_float64, + 'int32' : _algos.take_1d_int32, + 'int64' : _algos.take_1d_int64, + 'object' : _algos.take_1d_object, + 'bool' : _view_wrapper(_algos.take_1d_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_1d_int64, np.int64, + na_override=lib.iNaT), +} + +_take2d_axis0_dict = { + 'float64' : _algos.take_2d_axis0_float64, + 'int32' : _algos.take_2d_axis0_int32, + 'int64' : _algos.take_2d_axis0_int64, + 'object' : _algos.take_2d_axis0_object, + 'bool' : _view_wrapper(_algos.take_2d_axis0_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_axis0_int64, np.int64, + na_override=lib.iNaT), +} + +_take2d_axis1_dict = { + 'float64' : _algos.take_2d_axis1_float64, + 'int32' : _algos.take_2d_axis1_int32, + 'int64' : _algos.take_2d_axis1_int64, + 'object' : _algos.take_2d_axis1_object, + 'bool' : _view_wrapper(_algos.take_2d_axis1_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_axis1_int64, np.int64, + na_override=lib.iNaT), +} + +_take2d_multi_dict = { + 'float64' : _algos.take_2d_multi_float64, + 'int32' : _algos.take_2d_multi_int32, + 'int64' : _algos.take_2d_multi_int64, + 'object' : _algos.take_2d_multi_object, + 'bool' : _view_wrapper(_algos.take_2d_multi_bool, np.uint8), + 'datetime64[ns]' : _view_wrapper(_algos.take_2d_multi_int64, np.int64, + na_override=lib.iNaT), +} + +def _get_take2d_function(dtype_str, axis=0): + if axis == 0: + return _take2d_axis0_dict[dtype_str] + elif axis == 1: + return _take2d_axis1_dict[dtype_str] + elif axis == 'multi': + return _take2d_multi_dict[dtype_str] + else: # pragma: no cover + raise ValueError('bad axis: %s' % axis) + +def take_1d(arr, indexer, out=None, fill_value=np.nan): + """ + Specialized Cython take which sets NaN values in one pass + """ + dtype_str = arr.dtype.name + + n = len(indexer) + + indexer = _ensure_int64(indexer) + + out_passed = out is not None + take_f = _take1d_dict.get(dtype_str) + + if dtype_str in ('int32', 'int64', 'bool'): + try: + if out is None: + out = np.empty(n, dtype=arr.dtype) + take_f(arr, _ensure_int64(indexer), out=out, 
fill_value=fill_value) + except ValueError: + mask = indexer == -1 + if len(arr) == 0: + if not out_passed: + out = np.empty(n, dtype=arr.dtype) + else: + out = ndtake(arr, indexer, out=out) + if mask.any(): + if out_passed: + raise Exception('out with dtype %s does not support NA' % + out.dtype) + out = _maybe_upcast(out) + np.putmask(out, mask, fill_value) + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + if out is None: + out = np.empty(n, dtype=arr.dtype) + take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) + else: + out = ndtake(arr, indexer, out=out) + mask = indexer == -1 + if mask.any(): + if out_passed: + raise Exception('out with dtype %s does not support NA' % + out.dtype) + out = _maybe_upcast(out) + np.putmask(out, mask, fill_value) + + return out + +def take_2d_multi(arr, row_idx, col_idx, fill_value=np.nan): + + dtype_str = arr.dtype.name + + out_shape = len(row_idx), len(col_idx) + + if dtype_str in ('int32', 'int64', 'bool'): + row_mask = row_idx == -1 + col_mask= col_idx == -1 + needs_masking = row_mask.any() or col_mask.any() + + if needs_masking: + return take_2d_multi(_maybe_upcast(arr), row_idx, col_idx, + fill_value=fill_value) + else: + out = np.empty(out_shape, dtype=arr.dtype) + take_f = _get_take2d_function(dtype_str, axis='multi') + take_f(arr, _ensure_int64(row_idx), + _ensure_int64(col_idx), out=out, + fill_value=fill_value) + return out + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + out = np.empty(out_shape, dtype=arr.dtype) + take_f = _get_take2d_function(dtype_str, axis='multi') + take_f(arr, _ensure_int64(row_idx), _ensure_int64(col_idx), out=out, + fill_value=fill_value) + return out + else: + return take_2d(take_2d(arr, row_idx, axis=0, fill_value=fill_value), + col_idx, axis=1, fill_value=fill_value) + + +def take_2d(arr, indexer, out=None, mask=None, needs_masking=None, axis=0, + fill_value=np.nan): + """ + Specialized Cython take which sets NaN values in one pass + """ + dtype_str = arr.dtype.name + + out_shape = list(arr.shape) + out_shape[axis] = len(indexer) + out_shape = tuple(out_shape) + + if not isinstance(indexer, np.ndarray): + indexer = np.array(indexer, dtype=np.int64) + + if dtype_str in ('int32', 'int64', 'bool'): + if mask is None: + mask = indexer == -1 + needs_masking = mask.any() + + if needs_masking: + # upcasting may be required + result = ndtake(arr, indexer, axis=axis, out=out) + result = _maybe_mask(result, mask, needs_masking, axis=axis, + out_passed=out is not None, + fill_value=fill_value) + return result + else: + if out is None: + out = np.empty(out_shape, dtype=arr.dtype) + take_f = _get_take2d_function(dtype_str, axis=axis) + take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) + return out + elif dtype_str in ('float64', 'object', 'datetime64[ns]'): + if out is None: + out = np.empty(out_shape, dtype=arr.dtype) + take_f = _get_take2d_function(dtype_str, axis=axis) + take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value) + return out + else: + if mask is None: + mask = indexer == -1 + needs_masking = mask.any() + + # GH #486 + if out is not None and arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + + result = ndtake(arr, indexer, axis=axis, out=out) + result = _maybe_mask(result, mask, needs_masking, axis=axis, + out_passed=out is not None, + fill_value=fill_value) + return result + +def ndtake(arr, indexer, axis=0, out=None): + return arr.take(_ensure_platform_int(indexer), axis=axis, out=out) + +def mask_out_axis(arr, mask, axis, 
fill_value=np.nan): + indexer = [slice(None)] * arr.ndim + indexer[axis] = mask + + arr[tuple(indexer)] = fill_value + +def take_fast(arr, indexer, mask, needs_masking, axis=0, out=None, + fill_value=np.nan): + if arr.ndim == 2: + return take_2d(arr, indexer, out=out, mask=mask, + needs_masking=needs_masking, + axis=axis, fill_value=fill_value) + indexer = _ensure_platform_int(indexer) + result = ndtake(arr, indexer, axis=axis, out=out) + result = _maybe_mask(result, mask, needs_masking, axis=axis, + out_passed=out is not None, fill_value=fill_value) + return result + +def _maybe_mask(result, mask, needs_masking, axis=0, out_passed=False, + fill_value=np.nan): + if needs_masking: + if out_passed and _need_upcast(result): + raise Exception('incompatible type for NAs') + else: + # a bit spaghettified + result = _maybe_upcast(result) + mask_out_axis(result, mask, axis, fill_value) + return result + +def _maybe_upcast(values): + if issubclass(values.dtype.type, np.integer): + values = values.astype(float) + elif issubclass(values.dtype.type, np.bool_): + values = values.astype(object) + + return values + +def _need_upcast(values): + if issubclass(values.dtype.type, (np.integer, np.bool_)): + return True + return False + +def _interp_wrapper(f, wrap_dtype, na_override=None): + def wrapper(arr, mask, limit=None): + view = arr.view(wrap_dtype) + f(view, mask, limit=limit) + return wrapper + +_pad_1d_datetime = _interp_wrapper(_algos.pad_inplace_int64, np.int64) +_pad_2d_datetime = _interp_wrapper(_algos.pad_2d_inplace_int64, np.int64) +_backfill_1d_datetime = _interp_wrapper(_algos.backfill_inplace_int64, np.int64) +_backfill_2d_datetime = _interp_wrapper(_algos.backfill_2d_inplace_int64, np.int64) + +def pad_1d(values, limit=None, mask=None): + if is_float_dtype(values): + _method = _algos.pad_inplace_float64 + elif is_datetime64_dtype(values): + _method = _pad_1d_datetime + elif values.dtype == np.object_: + _method = _algos.pad_inplace_object + else: # pragma: no cover + raise ValueError('Invalid dtype for padding') + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + _method(values, mask, limit=limit) + +def backfill_1d(values, limit=None, mask=None): + if is_float_dtype(values): + _method = _algos.backfill_inplace_float64 + elif is_datetime64_dtype(values): + _method = _backfill_1d_datetime + elif values.dtype == np.object_: + _method = _algos.backfill_inplace_object + else: # pragma: no cover + raise ValueError('Invalid dtype for padding') + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + _method(values, mask, limit=limit) + +def pad_2d(values, limit=None, mask=None): + if is_float_dtype(values): + _method = _algos.pad_2d_inplace_float64 + elif is_datetime64_dtype(values): + _method = _pad_2d_datetime + elif values.dtype == np.object_: + _method = _algos.pad_2d_inplace_object + else: # pragma: no cover + raise ValueError('Invalid dtype for padding') + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + _method(values, mask, limit=limit) + +def backfill_2d(values, limit=None, mask=None): + if is_float_dtype(values): + _method = _algos.backfill_2d_inplace_float64 + elif is_datetime64_dtype(values): + _method = _backfill_2d_datetime + elif values.dtype == np.object_: + _method = _algos.backfill_2d_inplace_object + else: # pragma: no cover + raise ValueError('Invalid dtype for padding') + + if mask is None: + mask = isnull(values) + mask = mask.view(np.uint8) + + _method(values, mask, limit=limit) + +def 
_consensus_name_attr(objs): + name = objs[0].name + for obj in objs[1:]: + if obj.name != name: + return None + return name + +#---------------------------------------------------------------------- +# Lots of little utilities + +def _infer_dtype(value): + if isinstance(value, (float, np.floating)): + return np.float_ + elif isinstance(value, (bool, np.bool_)): + return np.bool_ + elif isinstance(value, (int, np.integer)): + return np.int_ + else: + return np.object_ + +def _possibly_cast_item(obj, item, dtype): + chunk = obj[item] + + if chunk.values.dtype != dtype: + if dtype in (np.object_, np.bool_): + obj[item] = chunk.astype(np.object_) + elif not issubclass(dtype, (np.integer, np.bool_)): # pragma: no cover + raise ValueError("Unexpected dtype encountered: %s" % dtype) + +def _is_bool_indexer(key): + if isinstance(key, np.ndarray) and key.dtype == np.object_: + key = np.asarray(key) + + if not lib.is_bool_array(key): + if isnull(key).any(): + raise ValueError('cannot index with vector containing ' + 'NA / NaN values') + return False + return True + elif isinstance(key, np.ndarray) and key.dtype == np.bool_: + return True + elif isinstance(key, list): + try: + return np.asarray(key).dtype == np.bool_ + except TypeError: # pragma: no cover + return False + + return False + +def _default_index(n): + from pandas.core.index import Index + return Index(np.arange(n)) + +def ensure_float(arr): + if issubclass(arr.dtype.type, np.integer): + arr = arr.astype(float) + + return arr + +def _mut_exclusive(arg1, arg2): + if arg1 is not None and arg2 is not None: + raise Exception('mutually exclusive arguments') + elif arg1 is not None: + return arg1 + else: + return arg2 + +def _any_none(*args): + for arg in args: + if arg is None: + return True + return False + +def _all_not_none(*args): + for arg in args: + if arg is None: + return False + return True + +def _try_sort(iterable): + listed = list(iterable) + try: + return sorted(listed) + except Exception: + return listed + +def _count_not_none(*args): + return sum(x is not None for x in args) + +#------------------------------------------------------------------------------ +# miscellaneous python tools + +def rands(n): + """Generates a random alphanumeric string of length *n*""" + from random import Random + import string + return ''.join(Random().sample(string.ascii_letters+string.digits, n)) + +def adjoin(space, *lists): + """ + Glues together two sets of strings using the amount of space requested. + The idea is to prettify. 
+ """ + out_lines = [] + newLists = [] + lengths = [max(map(len, x)) + space for x in lists[:-1]] + + # not the last one + lengths.append(max(map(len, lists[-1]))) + + maxLen = max(map(len, lists)) + for i, lst in enumerate(lists): + nl = [x.ljust(lengths[i]) for x in lst] + nl.extend([' ' * lengths[i]] * (maxLen - len(lst))) + newLists.append(nl) + toJoin = zip(*newLists) + for lines in toJoin: + out_lines.append(_join_unicode(lines)) + return _join_unicode(out_lines, sep='\n') + +def _join_unicode(lines, sep=''): + try: + return sep.join(lines) + except UnicodeDecodeError: + sep = unicode(sep) + return sep.join([x.decode('utf-8') if isinstance(x, str) else x + for x in lines]) + +def iterpairs(seq): + """ + Parameters + ---------- + seq: sequence + + Returns + ------- + iterator returning overlapping pairs of elements + + Example + ------- + >>> iterpairs([1, 2, 3, 4]) + [(1, 2), (2, 3), (3, 4) + """ + # input may not be sliceable + seq_it = iter(seq) + seq_it_next = iter(seq) + _ = next(seq_it_next) + + return itertools.izip(seq_it, seq_it_next) + +def indent(string, spaces=4): + dent = ' ' * spaces + return '\n'.join([dent + x for x in string.split('\n')]) + +def banner(message): + """ + Return 80-char width message declaration with = bars on top and bottom. + """ + bar = '=' * 80 + return '%s\n%s\n%s' % (bar, message, bar) + +class groupby(dict): + """ + A simple groupby different from the one in itertools. + + Does not require the sequence elements to be sorted by keys, + however it is slower. + """ + def __init__(self, seq, key=lambda x:x): + for value in seq: + k = key(value) + self.setdefault(k, []).append(value) + try: + __iter__ = dict.iteritems + except AttributeError: # pragma: no cover + # Python 3 + def __iter__(self): + return iter(dict.items(self)) + +def map_indices_py(arr): + """ + Returns a dictionary with (element, index) pairs for each element in the + given array/list + """ + return dict([(x, i) for i, x in enumerate(arr)]) + +def union(*seqs): + result = set([]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result |= seq + return type(seqs[0])(list(result)) + +def difference(a, b): + return type(a)(list(set(a) - set(b))) + +def intersection(*seqs): + result = set(seqs[0]) + for seq in seqs: + if not isinstance(seq, set): + seq = set(seq) + result &= seq + return type(seqs[0])(list(result)) + +def _asarray_tuplesafe(values, dtype=None): + if not isinstance(values, (list, tuple, np.ndarray)): + values = list(values) + + if isinstance(values, list) and dtype in [np.object_, object]: + return lib.list_to_object_array(values) + + result = np.asarray(values, dtype=dtype) + + if issubclass(result.dtype.type, basestring): + result = np.asarray(values, dtype=object) + + if result.ndim == 2: + if isinstance(values, list): + return lib.list_to_object_array(values) + else: + # Making a 1D array that safely contains tuples is a bit tricky + # in numpy, leading to the following + result = np.empty(len(values), dtype=object) + result[:] = values + + return result + +def _index_labels_to_array(labels): + if isinstance(labels, (basestring, tuple)): + labels = [labels] + + if not isinstance(labels, (list, np.ndarray)): + try: + labels = list(labels) + except TypeError: # non-iterable + labels = [labels] + + labels = _asarray_tuplesafe(labels) + + return labels + +def _stringify(col): + # unicode workaround + try: + return unicode(col) + except UnicodeError: + return console_encode(col) + +def _stringify_seq(values): + if any(isinstance(x, unicode) for x in 
values): + return [_stringify(x) for x in values] + return [str(x) for x in values] + +def _maybe_make_list(obj): + if obj is not None and not isinstance(obj, (tuple, list)): + return [obj] + return obj + +def is_integer(obj): + return isinstance(obj, (int, long, np.integer)) + +def is_float(obj): + return isinstance(obj, (float, np.floating)) + +def is_iterator(obj): + # python 3 generators have __next__ instead of next + return hasattr(obj, 'next') or hasattr(obj, '__next__') + +def is_integer_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype.type + else: + tipo = arr_or_dtype.dtype.type + return (issubclass(tipo, np.integer) and not + issubclass(tipo, np.datetime64)) + +def is_datetime64_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype.type + else: + tipo = arr_or_dtype.dtype.type + return issubclass(tipo, np.datetime64) + +def is_float_dtype(arr_or_dtype): + if isinstance(arr_or_dtype, np.dtype): + tipo = arr_or_dtype.type + else: + tipo = arr_or_dtype.dtype.type + return issubclass(tipo, np.floating) + + +_ensure_float64 = _algos.ensure_float64 +_ensure_int64 = _algos.ensure_int64 +_ensure_int32 = _algos.ensure_int32 +_ensure_platform_int = _algos.ensure_platform_int +_ensure_object = _algos.ensure_object + + +def _astype_nansafe(arr, dtype): + if isinstance(dtype, basestring): + dtype = np.dtype(dtype) + + if issubclass(arr.dtype.type, np.datetime64): + if dtype == object: + return lib.ints_to_pydatetime(arr.view(np.int64)) + elif (np.issubdtype(arr.dtype, np.floating) and + np.issubdtype(dtype, np.integer)): + + if np.isnan(arr).any(): + raise ValueError('Cannot convert NA to integer') + + return arr.astype(dtype) + +def _clean_fill_method(method): + method = method.lower() + if method == 'ffill': + method = 'pad' + if method == 'bfill': + method = 'backfill' + if method not in ['pad', 'backfill']: + msg = ('Invalid fill method. Expecting pad (ffill) or backfill (bfill).' 
+ ' Got %s' % method) + raise ValueError(msg) + return method + +def _all_none(*args): + for arg in args: + if arg is not None: + return False + return True + + +def save(obj, path): + """ + Pickle (serialize) object to input file path + + Parameters + ---------- + obj : any object + path : string + File path + """ + f = open(path, 'wb') + try: + pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL) + finally: + f.close() + + +def load(path): + """ + Load pickled pandas object (or any other pickled object) from the specified + file path + + Parameters + ---------- + path : string + File path + + Returns + ------- + unpickled : type of object stored in file + """ + f = open(path, 'rb') + try: + return pickle.load(f) + finally: + f.close() + +def console_encode(value): + if py3compat.PY3 or not isinstance(value, unicode): + return value + + try: + import sys + return value.encode(sys.stdin.encoding, 'replace') + except (AttributeError, TypeError): + return value.encode('ascii', 'replace') + +class UTF8Recoder: + """ + Iterator that reads an encoded stream and reencodes the input to UTF-8 + """ + def __init__(self, f, encoding): + self.reader = codecs.getreader(encoding)(f) + + def __iter__(self): + return self + + def next(self): + return self.reader.next().encode("utf-8") + +def _get_handle(path, mode, encoding=None): + if py3compat.PY3: # pragma: no cover + if encoding: + f = open(path, mode, encoding=encoding) + else: + f = open(path, mode, errors='replace') + else: + f = open(path, mode) + return f + +if py3compat.PY3: # pragma: no cover + def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): + # ignore encoding + return csv.reader(f, dialect=dialect, **kwds) + + def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): + return csv.writer(f, dialect=dialect, **kwds) +else: + class UnicodeReader: + """ + A CSV reader which will iterate over lines in the CSV file "f", + which is encoded in the given encoding. + + On Python 3, this is replaced (below) by csv.reader, which handles + unicode. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + f = UTF8Recoder(f, encoding) + self.reader = csv.reader(f, dialect=dialect, **kwds) + + def next(self): + row = self.reader.next() + return [unicode(s, "utf-8") for s in row] + + def __iter__(self): # pragma: no cover + return self + + class UnicodeWriter: + """ + A CSV writer which will write rows to CSV file "f", + which is encoded in the given encoding. + """ + + def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): + # Redirect output to a queue + self.queue = StringIO() + self.writer = csv.writer(self.queue, dialect=dialect, **kwds) + self.stream = f + self.encoder = codecs.getincrementalencoder(encoding)() + + def writerow(self, row): + row = [x if isinstance(x, basestring) else str(x) for x in row] + self.writer.writerow([s.encode("utf-8") for s in row]) + # Fetch UTF-8 output from the queue ... + data = self.queue.getvalue() + data = data.decode("utf-8") + # ... 
and reencode it into the target encoding + data = self.encoder.encode(data) + # write to the target stream + self.stream.write(data) + # empty queue + self.queue.truncate(0) + + +_NS_DTYPE = np.dtype('M8[ns]') + +def _concat_compat(to_concat): + if all(x.dtype == _NS_DTYPE for x in to_concat): + # work around NumPy 1.6 bug + new_values = np.concatenate([x.view(np.int64) for x in to_concat]) + return new_values.view(_NS_DTYPE) + else: + return np.concatenate(to_concat) + diff --git a/pandas/core/daterange.py b/pandas/core/daterange.py new file mode 100644 index 00000000..4bf6ee5a --- /dev/null +++ b/pandas/core/daterange.py @@ -0,0 +1,45 @@ +# pylint: disable=E1101,E1103 + +from pandas.core.index import Index +from pandas.tseries.index import DatetimeIndex +import pandas.core.datetools as datetools + + +#----------------------------------------------------------------------------- +# DateRange class + +class DateRange(Index): + + offset = tzinfo = None + + def __new__(cls, start=None, end=None, periods=None, + offset=datetools.bday, time_rule=None, + tzinfo=None, name=None, **kwds): + + import warnings + warnings.warn("DateRange is deprecated, use DatetimeIndex instead", + FutureWarning) + + if time_rule is None: + time_rule = kwds.get('timeRule') + if time_rule is not None: + offset = datetools.get_offset(time_rule) + + return DatetimeIndex(start=start, end=end, + periods=periods, freq=offset, + tzinfo=tzinfo, name=name, **kwds) + + def __setstate__(self, aug_state): + """Necessary for making this object picklable""" + index_state = aug_state[:1] + offset = aug_state[1] + + # for backwards compatibility + if len(aug_state) > 2: + tzinfo = aug_state[2] + else: # pragma: no cover + tzinfo = None + + self.offset = offset + self.tzinfo = tzinfo + Index.__setstate__(self, *index_state) diff --git a/pandas/core/datetools.py b/pandas/core/datetools.py new file mode 100644 index 00000000..c2f83241 --- /dev/null +++ b/pandas/core/datetools.py @@ -0,0 +1,32 @@ +"""A collection of random tools for dealing with dates in Python""" + +from pandas.tseries.tools import * +from pandas.tseries.offsets import * +from pandas.tseries.frequencies import * + + +day = DateOffset() +bday = BDay() +businessDay = bday +monthEnd = MonthEnd() +yearEnd = YearEnd() +yearBegin = YearBegin() +bmonthEnd = BMonthEnd() +businessMonthEnd = bmonthEnd +bquarterEnd = BQuarterEnd() +quarterEnd = QuarterEnd() +byearEnd = BYearEnd() +week = Week() + +# Functions/offsets to roll dates forward +thisMonthEnd = MonthEnd(0) +thisBMonthEnd = BMonthEnd(0) +thisYearEnd = YearEnd(0) +thisYearBegin = YearBegin(0) +thisBQuarterEnd = BQuarterEnd(0) +thisQuarterEnd = QuarterEnd(0) + +# Functions to check where a date lies +isBusinessDay = BDay().onOffset +isMonthEnd = MonthEnd().onOffset +isBMonthEnd = BMonthEnd().onOffset diff --git a/pandas/core/format.py b/pandas/core/format.py new file mode 100644 index 00000000..776a0fd8 --- /dev/null +++ b/pandas/core/format.py @@ -0,0 +1,862 @@ +from itertools import izip + +try: + from StringIO import StringIO +except: + from io import StringIO + +from pandas.core.common import adjoin, isnull, notnull, _stringify +from pandas.core.index import MultiIndex, _ensure_index +from pandas.util import py3compat + +import pandas.core.common as com +import pandas.lib as lib + +import numpy as np + +docstring_to_string = """ + Parameters + ---------- + frame : DataFrame + object to render + buf : StringIO-like, optional + buffer to write to + columns : sequence, optional + the subset of columns to write; default 
None writes all columns + col_space : int, optional + the width of each columns + header : bool, optional + whether to print column labels, default True + index : bool, optional + whether to print index (row) labels, default True + na_rep : string, optional + string representation of NAN to use, default 'NaN' + formatters : list or dict of one-parameter functions, optional + formatter functions to apply to columns' elements by position or name, + default None + float_format : one-parameter function, optional + formatter function to apply to columns' elements if they are floats + default None + sparsify : bool, optional + Set to False for a DataFrame with a hierarchical index to print every + multiindex key at each row, default True + justify : {'left', 'right'}, default None + Left or right-justify the column labels. If None uses the option from + the configuration in pandas.core.common, 'left' out of the box + index_names : bool, optional + Prints the names of the indexes, default True + force_unicode : bool, default False + Always return a unicode result + + Returns + ------- + formatted : string (or unicode, depending on data and options)""" + +class SeriesFormatter(object): + + def __init__(self, series, buf=None, header=True, length=True, + na_rep='NaN', name=False, float_format=None): + self.series = series + self.buf = buf if buf is not None else StringIO() + self.name = name + self.na_rep = na_rep + self.length = length + self.header = header + + if float_format is None: + float_format = print_config.float_format + self.float_format = float_format + + def _get_footer(self): + footer = '' + + if self.name: + if getattr(self.series.index, 'freq', None): + footer += 'Freq: %s' % self.series.index.freqstr + + if footer and self.series.name: + footer += ', ' + footer += ("Name: %s" % str(self.series.name) + if self.series.name else '') + + if self.length: + if footer: + footer += ', ' + footer += 'Length: %d' % len(self.series) + return footer + + def _get_formatted_index(self): + index = self.series.index + is_multi = isinstance(index, MultiIndex) + if is_multi: + have_header = any(name for name in index.names) + fmt_index = index.format(names=True) + else: + have_header = index.name is not None + fmt_index = index.format(name=True) + return fmt_index, have_header + + def _get_formatted_values(self): + return format_array(self.series.values, None, + float_format=self.float_format, + na_rep=self.na_rep) + + def to_string(self): + series = self.series + + if len(series) == 0: + return '' + + fmt_index, have_header = self._get_formatted_index() + fmt_values = self._get_formatted_values() + + maxlen = max(len(x) for x in fmt_index) + pad_space = min(maxlen, 60) + + result = ['%s %s'] * len(fmt_values) + for i, (k, v) in enumerate(izip(fmt_index[1:], fmt_values)): + try: + idx = k.ljust(pad_space + _encode_diff(k)) + except UnicodeEncodeError: + idx = k.ljust(pad_space) + result[i] = result[i] % (idx, v) + + if self.header and have_header: + result.insert(0, fmt_index[0]) + + footer = self._get_footer() + if footer: + result.append(footer) + + return '\n'.join(result) + +if py3compat.PY3: # pragma: no cover + _encode_diff = lambda x: 0 +else: + def _encode_diff(x): + return len(x) - len(x.decode('utf-8')) + +class DataFrameFormatter(object): + """ + Render a DataFrame + + self.to_string() : console-friendly tabular output + self.to_html() : html table + + """ + + __doc__ += docstring_to_string + + def __init__(self, frame, buf=None, columns=None, col_space=None, + header=True, index=True, 
na_rep='NaN', formatters=None, + justify=None, float_format=None, sparsify=True, + index_names=True, **kwds): + self.frame = frame + self.buf = buf if buf is not None else StringIO() + self.show_index_names = index_names + self.sparsify = sparsify + self.float_format = float_format + self.formatters = formatters if formatters is not None else {} + self.na_rep = na_rep + self.col_space = col_space + self.header = header + self.index = index + + if justify is None: + self.justify = print_config.colheader_justify + else: + self.justify = justify + + self.kwds = kwds + + if columns is not None: + self.columns = _ensure_index(columns) + self.frame = self.frame[self.columns] + else: + self.columns = frame.columns + + def to_string(self, force_unicode=False): + """ + Render a DataFrame to a console-friendly tabular output. + """ + frame = self.frame + + to_write = [] + + if len(frame.columns) == 0 or len(frame.index) == 0: + info_line = (u'Empty %s\nColumns: %s\nIndex: %s' + % (type(self.frame).__name__, + frame.columns, frame.index)) + to_write.append(info_line) + else: + # may include levels names also + str_index = self._get_formatted_index() + str_columns = self._get_formatted_column_labels() + + stringified = [] + + for i, c in enumerate(self.columns): + if self.header: + fmt_values = self._format_col(i) + cheader = str_columns[i] + max_len = max(max(len(x) for x in fmt_values), + max(len(x) for x in cheader)) + if self.justify == 'left': + cheader = [x.ljust(max_len) for x in cheader] + else: + cheader = [x.rjust(max_len) for x in cheader] + fmt_values = cheader + fmt_values + stringified.append(_make_fixed_width(fmt_values, + self.justify)) + else: + stringified = [_make_fixed_width(self._format_col(i), + self.justify) + for i, c in enumerate(self.columns)] + + if self.index: + to_write.append(adjoin(1, str_index, *stringified)) + else: + to_write.append(adjoin(1, *stringified)) + + if not py3compat.PY3: + if force_unicode: + to_write = [unicode(s) for s in to_write] + else: + # generally everything is plain strings, which has ascii + # encoding. problem is when there is a char with value over 127 + # - everything then gets converted to unicode. + try: + for s in to_write: + str(s) + except UnicodeError: + to_write = [unicode(s) for s in to_write] + + self.buf.writelines(to_write) + + def _format_col(self, i): + col = self.columns[i] + formatter = self.formatters.get(col) + return format_array(self.frame.icol(i).values, formatter, + float_format=self.float_format, + na_rep=self.na_rep, + space=self.col_space) + + def to_html(self): + """ + Render a DataFrame to a html table. 
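+
+        Illustrative sketch (not part of the original docstring): the rendered
+        HTML is written to self.buf rather than returned; df below stands for
+        any DataFrame.
+
+        >>> formatter = DataFrameFormatter(df)
+        >>> formatter.to_html()
+        >>> html = formatter.buf.getvalue()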
+ """ + def _str(x): + if not isinstance(x, basestring): + return str(x) + return x + + elements = [] + def write(s, indent=0): + elements.append(' ' * indent + _str(s)) + + + def write_th(s, indent=0): + write('%s' % _str(s), indent) + + def write_td(s, indent=0): + write('%s' % _str(s), indent) + + def write_tr(l, indent=0, indent_delta=4, header=False): + write('', indent) + indent += indent_delta + if header: + for s in l: + write_th(s, indent) + else: + for s in l: + write_td(s, indent) + indent -= indent_delta + write('', indent) + + indent = 0 + indent_delta = 2 + frame = self.frame + + write('', indent) + + def _column_header(): + row = [''] * (frame.index.nlevels - 1) + + if isinstance(self.columns, MultiIndex): + if self.has_column_names: + row.append(single_column_table(self.columns.names)) + else: + row.append('') + row.extend([single_column_table(c) for c in self.columns]) + else: + row.append(self.columns.name or '') + row.extend(self.columns) + return row + + if len(frame.columns) == 0 or len(frame.index) == 0: + write('', indent + indent_delta) + write_tr([repr(frame.index), + 'Empty %s' % type(self.frame).__name__], + indent + (2 * indent_delta), + indent_delta) + write('', indent + indent_delta) + else: + indent += indent_delta + + # header row + if self.header: + write('', indent) + row = [] + + col_row = _column_header() + indent += indent_delta + write_tr(col_row, indent, indent_delta, header=True) + if self.has_index_names: + row = frame.index.names + [''] * len(self.columns) + write_tr(row, indent, indent_delta, header=True) + + indent -= indent_delta + write('', indent) + + write('', indent) + indent += indent_delta + + _bold_row = self.kwds.get('bold_rows', False) + def _maybe_bold_row(x): + temp = '%s' + if _bold_row: + return ([temp % y for y in x] if isinstance(x, tuple) + else temp % x) + else: + return x + + fmt_values = {} + for i in range(len(self.columns)): + fmt_values[i] = self._format_col(i) + + # write values + for i in range(len(frame)): + row = [] + if isinstance(frame.index, MultiIndex): + row.extend(_maybe_bold_row(frame.index[i])) + else: + row.append(_maybe_bold_row(frame.index[i])) + for j in range(len(self.columns)): + row.append(fmt_values[j][i]) + write_tr(row, indent, indent_delta) + indent -= indent_delta + write('', indent) + indent -= indent_delta + + write('
', indent) + + _put_lines(self.buf, elements) + + def _get_formatted_column_labels(self): + from pandas.core.index import _sparsify + + def is_numeric_dtype(dtype): + return issubclass(dtype.type, np.number) + + if isinstance(self.columns, MultiIndex): + fmt_columns = self.columns.format(sparsify=False, adjoin=False) + fmt_columns = zip(*fmt_columns) + dtypes = self.frame.dtypes.values + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = zip(*[[' ' + y + if y not in self.formatters and need_leadsp[x] + else y for y in x] + for x in fmt_columns]) + if self.sparsify: + str_columns = _sparsify(str_columns) + + str_columns = [list(x) for x in zip(*str_columns)] + else: + fmt_columns = self.columns.format() + dtypes = self.frame.dtypes + need_leadsp = dict(zip(fmt_columns, map(is_numeric_dtype, dtypes))) + str_columns = [[' ' + x + if col not in self.formatters and need_leadsp[x] + else x] + for col, x in zip(self.columns, fmt_columns)] + + if self.show_index_names and self.has_index_names: + for x in str_columns: + x.append('') + + return str_columns + + @property + def has_index_names(self): + return _has_names(self.frame.index) + + @property + def has_column_names(self): + return _has_names(self.frame.columns) + + def _get_formatted_index(self): + index = self.frame.index + columns = self.frame.columns + + show_index_names = self.show_index_names and self.has_index_names + show_col_names = (self.show_index_names and self.has_column_names) + + if isinstance(index, MultiIndex): + fmt_index = index.format(sparsify=self.sparsify, adjoin=False, + names=show_index_names) + else: + fmt_index = [index.format(name=show_index_names)] + + adjoined = adjoin(1, *fmt_index).split('\n') + + # empty space for columns + if show_col_names: + col_header = ['%s' % x for x in self._get_column_name_list()] + else: + col_header = [''] * columns.nlevels + + if self.header: + return col_header + adjoined + else: + return adjoined + + def _get_column_name_list(self): + names = [] + columns = self.frame.columns + if isinstance(columns, MultiIndex): + names.extend('' if name is None else name + for name in columns.names) + else: + names.append('' if columns.name is None else columns.name) + return names + +#---------------------------------------------------------------------- +# Array formatters + + +def format_array(values, formatter, float_format=None, na_rep='NaN', + digits=None, space=None, justify='right'): + if com.is_float_dtype(values.dtype): + fmt_klass = FloatArrayFormatter + elif com.is_integer_dtype(values.dtype): + fmt_klass = IntArrayFormatter + elif com.is_datetime64_dtype(values.dtype): + fmt_klass = Datetime64Formatter + else: + fmt_klass = GenericArrayFormatter + + if space is None: + space = print_config.column_space + + if float_format is None: + float_format = print_config.float_format + + if digits is None: + digits = print_config.precision + + fmt_obj = fmt_klass(values, digits, na_rep=na_rep, + float_format=float_format, + formatter=formatter, space=space, + justify=justify) + + return fmt_obj.get_result() + + +class GenericArrayFormatter(object): + + def __init__(self, values, digits=7, formatter=None, na_rep='NaN', + space=12, float_format=None, justify='right'): + self.values = values + self.digits = digits + self.na_rep = na_rep + self.space = space + self.formatter = formatter + self.float_format = float_format + self.justify = justify + + def get_result(self): + if self._have_unicode(): + fmt_values = self._format_strings(use_unicode=True) + else: + 
fmt_values = self._format_strings(use_unicode=False) + + return _make_fixed_width(fmt_values, self.justify) + + def _have_unicode(self): + mask = lib.map_infer(self.values, lambda x: isinstance(x, unicode)) + return mask.any() + + def _format_strings(self, use_unicode=False): + if self.float_format is None: + float_format = print_config.float_format + if float_format is None: + fmt_str = '%% .%dg' % print_config.precision + float_format = lambda x: fmt_str % x + else: + float_format = self.float_format + + if use_unicode: + formatter = _stringify if self.formatter is None else self.formatter + else: + formatter = str if self.formatter is None else self.formatter + + def _format(x): + if self.na_rep is not None and lib.checknull(x): + if x is None: + return 'None' + return self.na_rep + else: + # object dtype + return '%s' % formatter(x) + + vals = self.values + + is_float = lib.map_infer(vals, com.is_float) & notnull(vals) + leading_space = is_float.any() + + fmt_values = [] + for i, v in enumerate(vals): + if not is_float[i] and leading_space: + fmt_values.append(' %s' % _format(v)) + elif is_float[i]: + fmt_values.append(float_format(v)) + else: + fmt_values.append(' %s' % _format(v)) + + return fmt_values + +class FloatArrayFormatter(GenericArrayFormatter): + """ + + """ + + def __init__(self, *args, **kwargs): + GenericArrayFormatter.__init__(self, *args, **kwargs) + + if self.float_format is not None and self.formatter is None: + self.formatter = self.float_format + + def _format_with(self, fmt_str): + fmt_values = [fmt_str % x if notnull(x) else self.na_rep + for x in self.values] + return _trim_zeros(fmt_values, self.na_rep) + + def get_result(self): + if self.formatter is not None: + fmt_values = [self.formatter(x) for x in self.values] + else: + fmt_str = '%% .%df' % (self.digits - 1) + fmt_values = self._format_with(fmt_str) + + if len(fmt_values) > 0: + maxlen = max(len(x) for x in fmt_values) + else: + maxlen =0 + + too_long = maxlen > self.digits + 5 + + # this is pretty arbitrary for now + has_large_values = (np.abs(self.values) > 1e8).any() + + if too_long and has_large_values: + fmt_str = '%% .%de' % (self.digits - 1) + fmt_values = self._format_with(fmt_str) + + return _make_fixed_width(fmt_values, self.justify) + + +class IntArrayFormatter(GenericArrayFormatter): + + def get_result(self): + if self.formatter: + formatter = self.formatter + else: + formatter = lambda x: '% d' % x + + fmt_values = [formatter(x) for x in self.values] + + return _make_fixed_width(fmt_values, self.justify) + + +class Datetime64Formatter(GenericArrayFormatter): + + def get_result(self): + if self.formatter: + formatter = self.formatter + else: + formatter = _format_datetime64 + + fmt_values = [formatter(x) for x in self.values] + return _make_fixed_width(fmt_values, self.justify) + +def _format_datetime64(x, tz=None): + if isnull(x): + return 'NaT' + + stamp = lib.Timestamp(x, tz=tz) + return stamp._repr_base + + +def _make_fixed_width(strings, justify='right'): + if len(strings) == 0: + return strings + + max_len = max(len(x) for x in strings) + conf_max = print_config.max_colwidth + if conf_max is not None and max_len > conf_max: + max_len = conf_max + + if justify == 'left': + justfunc = lambda self, x: self.ljust(x) + else: + justfunc = lambda self, x: self.rjust(x) + + def just(x): + return justfunc(x[:max_len], max_len) + + return [just(x) for x in strings] + +def _trim_zeros(str_floats, na_rep='NaN'): + """ + Trims zeros and decimal points + """ + # TODO: what if exponential? 
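+    # e.g. ['1.500', '2.300'] -> ['1.5', '2.3'], and ['1.0', '2.0'] -> ['1', '2']
+    # (trailing zeros are stripped first, then any bare trailing '.')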
+    trimmed = str_floats
+
+    def _cond(values):
+        non_na = [x for x in values if x != na_rep]
+        return len(non_na) > 0 and all([x.endswith('0') for x in non_na])
+
+    while _cond(trimmed):
+        trimmed = [x[:-1] if x != na_rep else x for x in trimmed]
+
+    # trim decimal points
+    return [x[:-1] if x.endswith('.') and x != na_rep else x for x in trimmed]
+
+
+def single_column_table(column):
+    table = '<table><tbody>'
+    for i in column:
+        table += ('<tr><td>%s</td></tr>' % str(i))
+    table += '</tbody></table>'
+    return table
+
+def single_row_table(row):  # pragma: no cover
+    table = '<table><tbody><tr>'
+    for i in row:
+        table += ('<td>%s</td>' % str(i))
+    table += '</tr></tbody></table>
' + return table + +def _has_names(index): + if isinstance(index, MultiIndex): + return any([x is not None for x in index.names]) + else: + return index.name is not None + + + +#------------------------------------------------------------------------------- +# Global formatting options + +def set_printoptions(precision=None, column_space=None, max_rows=None, + max_columns=None, colheader_justify=None, + max_colwidth=None, notebook_repr_html=None, + date_dayfirst=None, date_yearfirst=None): + """ + Alter default behavior of DataFrame.toString + + precision : int + Floating point output precision (number of significant digits). This is + only a suggestion + column_space : int + Default space for DataFrame columns, defaults to 12 + max_rows : int + max_columns : int + max_rows and max_columns are used in __repr__() methods to decide if + to_string() or info() is used to render an object to a string. + Either one, or both can be set to 0 (experimental). Pandas will figure + out how big the terminal is and will not display more rows or/and + columns that can fit on it. + colheader_justify + notebook_repr_html : boolean + When True (default), IPython notebook will use html representation for + pandas objects (if it is available). + date_dayfirst : boolean + When True, prints and parses dates with the day first, eg 20/01/2005 + date_yearfirst : boolean + When True, prints and parses dates with the year first, eg 2005/01/20 + """ + if precision is not None: + print_config.precision = precision + if column_space is not None: + print_config.column_space = column_space + if max_rows is not None: + print_config.max_rows = max_rows + if max_colwidth is not None: + print_config.max_colwidth = max_colwidth + if max_columns is not None: + print_config.max_columns = max_columns + if colheader_justify is not None: + print_config.colheader_justify = colheader_justify + if notebook_repr_html is not None: + print_config.notebook_repr_html = notebook_repr_html + if date_dayfirst is not None: + print_config.date_dayfirst = date_dayfirst + if date_yearfirst is not None: + print_config.date_yearfirst = date_yearfirst + +def reset_printoptions(): + print_config.reset() + +class EngFormatter(object): + """ + Formats float values according to engineering format. + + Based on matplotlib.ticker.EngFormatter + """ + + # The SI engineering prefixes + ENG_PREFIXES = { + -24: "y", + -21: "z", + -18: "a", + -15: "f", + -12: "p", + -9: "n", + -6: "u", + -3: "m", + 0: "", + 3: "k", + 6: "M", + 9: "G", + 12: "T", + 15: "P", + 18: "E", + 21: "Z", + 24: "Y" + } + + def __init__(self, accuracy=None, use_eng_prefix=False): + self.accuracy = accuracy + self.use_eng_prefix = use_eng_prefix + + def __call__(self, num): + """ Formats a number in engineering notation, appending a letter + representing the power of 1000 of the original number. 
Some examples: + + >>> format_eng(0) # for self.accuracy = 0 + ' 0' + + >>> format_eng(1000000) # for self.accuracy = 1, + # self.use_eng_prefix = True + ' 1.0M' + + >>> format_eng("-1e-6") # for self.accuracy = 2 + # self.use_eng_prefix = False + '-1.00E-06' + + @param num: the value to represent + @type num: either a numeric value or a string that can be converted to + a numeric value (as per decimal.Decimal constructor) + + @return: engineering formatted string + """ + import decimal + import math + dnum = decimal.Decimal(str(num)) + + sign = 1 + + if dnum < 0: # pragma: no cover + sign = -1 + dnum = -dnum + + if dnum != 0: + pow10 = decimal.Decimal(int(math.floor(dnum.log10()/3)*3)) + else: + pow10 = decimal.Decimal(0) + + pow10 = pow10.min(max(self.ENG_PREFIXES.keys())) + pow10 = pow10.max(min(self.ENG_PREFIXES.keys())) + int_pow10 = int(pow10) + + if self.use_eng_prefix: + prefix = self.ENG_PREFIXES[int_pow10] + else: + if int_pow10 < 0: + prefix = 'E-%02d' % (-int_pow10) + else: + prefix = 'E+%02d' % int_pow10 + + mant = sign*dnum/(10**pow10) + + if self.accuracy is None: # pragma: no cover + format_str = u"% g%s" + else: + format_str = (u"%% .%if%%s" % self.accuracy ) + + formatted = format_str % (mant, prefix) + + return formatted #.strip() + +def set_eng_float_format(precision=None, accuracy=3, use_eng_prefix=False): + """ + Alter default behavior on how float is formatted in DataFrame. + Format float in engineering format. By accuracy, we mean the number of + decimal digits after the floating point. + + See also EngFormatter. + """ + if precision is not None: # pragma: no cover + import warnings + warnings.warn("'precision' parameter in set_eng_float_format is " + "being renamed to 'accuracy'" , FutureWarning) + accuracy = precision + + print_config.float_format = EngFormatter(accuracy, use_eng_prefix) + print_config.column_space = max(12, accuracy + 9) + + +class _GlobalPrintConfig(object): + """ + Holds the console formatting settings for DataFrame and friends + """ + + def __init__(self): + self.precision = self.digits = 7 + self.float_format = None + self.column_space = 12 + self.max_rows = 200 + self.max_colwidth = 50 + self.max_columns = 0 + self.colheader_justify = 'right' + self.notebook_repr_html = True + self.date_dayfirst = False + self.date_yearfirst = False + + def reset(self): + self.__init__() + +print_config = _GlobalPrintConfig() + + +def _put_lines(buf, lines): + if any(isinstance(x, unicode) for x in lines): + lines = [unicode(x) for x in lines] + buf.write('\n'.join(lines)) + + +if __name__ == '__main__': + arr = np.array([746.03, 0.00, 5620.00, 1592.36]) + # arr = np.array([11111111.1, 1.55]) + # arr = [314200.0034, 1.4125678] + arr = np.array([ 327763.3119, 345040.9076, 364460.9915, 398226.8688, + 383800.5172, 433442.9262, 539415.0568, 568590.4108, + 599502.4276, 620921.8593, 620898.5294, 552427.1093, + 555221.2193, 519639.7059, 388175.7 , 379199.5854, + 614898.25 , 504833.3333, 560600. , 941214.2857, + 1134250. , 1219550. , 855736.85 , 1042615.4286, + 722621.3043, 698167.1818, 803750. ]) + fmt = FloatArrayFormatter(arr, digits=7) + print fmt.get_result() diff --git a/pandas/core/frame.py b/pandas/core/frame.py new file mode 100644 index 00000000..9a3c4730 --- /dev/null +++ b/pandas/core/frame.py @@ -0,0 +1,4922 @@ +from __future__ import with_statement + +""" +DataFrame +--------- +An efficient 2D container for potentially mixed-type time series or other +labeled data series. 
+ +Similar to its R counterpart, data.frame, except providing automatic data +alignment and a host of useful data manipulation methods having to do with the +labeling information +""" + +# pylint: disable=E1101,E1103 +# pylint: disable=W0212,W0231,W0703,W0622 + +from itertools import izip +from StringIO import StringIO +import csv +import operator +import sys + +from numpy import nan +import numpy as np +import numpy.ma as ma + +from pandas.core.common import (isnull, notnull, PandasError, _try_sort, + _default_index, _stringify) +from pandas.core.generic import NDFrame +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels +from pandas.core.internals import BlockManager, make_block, form_blocks +from pandas.core.series import Series, _radd_compat +from pandas.compat.scipy import scoreatpercentile as _quantile +from pandas.util import py3compat +from pandas.util.terminal import get_terminal_size +from pandas.util.decorators import deprecate, Appender, Substitution + +from pandas.tseries.period import PeriodIndex + +import pandas.core.algorithms as algos +import pandas.core.datetools as datetools +import pandas.core.common as com +import pandas.core.format as fmt +import pandas.core.generic as generic +import pandas.core.nanops as nanops +import pandas.lib as lib + +#---------------------------------------------------------------------- +# Docstring templates + +_arith_doc = """ +Binary operator %s with support to substitute a fill_value for missing data in +one of the inputs + +Parameters +---------- +other : Series, DataFrame, or constant +axis : {0, 1, 'index', 'columns'} + For Series input, axis to match Series index on +fill_value : None or float value, default None + Fill missing (NaN) values with this value. If both DataFrame locations are + missing, the result will be missing +level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + +Notes +----- +Mismatched indices will be unioned together + +Returns +------- +result : DataFrame +""" + + +_stat_doc = """ +Return %(name)s over requested axis. +%(na_action)s + +Parameters +---------- +axis : {0, 1} + 0 for row-wise, 1 for column-wise +skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA +level : int, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame +%(extras)s +Returns +------- +%(shortname)s : Series (or DataFrame if level specified) +""" + +_doc_exclude_na = "NA/null values are excluded" + +_numeric_only_doc = """numeric_only : boolean, default None + Include only float, int, boolean data. If None, will attempt to use + everything, then use only numeric data +""" + +_merge_doc = """ +Merge DataFrame objects by performing a database-style join operation by +columns or indexes. + +If joining columns on columns, the DataFrame indexes *will be +ignored*. Otherwise if joining indexes on indexes or indexes on a column or +columns, the index will be passed on. + +Parameters +----------%s +right : DataFrame +how : {'left', 'right', 'outer', 'inner'}, default 'inner' + * left: use only keys from left frame (SQL: left outer join) + * right: use only keys from right frame (SQL: right outer join) + * outer: use union of keys from both frames (SQL: full outer join) + * inner: use intersection of keys from both frames (SQL: inner join) +on : label or list + Field names to join on. 
Must be found in both DataFrames. +left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns +right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs +left_index : boolean, default True + Use the index from the left DataFrame as the join key(s). If it is a + MultiIndex, the number of keys in the other DataFrame (either the index + or a number of columns) must match the number of levels +right_index : boolean, default True + Use the index from the right DataFrame as the join key. Same caveats as + left_index +sort : boolean, default True + Sort the join keys lexicographically in the result DataFrame +suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively +copy : boolean, default True + If False, do not copy data unnecessarily + +Examples +-------- + +>>> A >>> B + lkey value rkey value +0 foo 1 0 foo 5 +1 bar 2 1 bar 6 +2 baz 3 2 qux 7 +3 foo 4 3 bar 8 + +>>> merge(A, B, left_on='lkey', right_on='rkey', how='outer') + lkey value_x rkey value_y +0 bar 2 bar 6 +1 bar 2 bar 8 +2 baz 3 NaN NaN +3 foo 1 foo 5 +4 foo 4 foo 5 +5 NaN NaN qux 7 + +Returns +------- +merged : DataFrame +""" + +# Custom error class for update + +class DataConflictError(Exception): pass + +#---------------------------------------------------------------------- +# Factory helper methods + +def _arith_method(op, name, default_axis='columns'): + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + xrav = x.ravel() + result = np.empty(x.size, dtype=x.dtype) + if isinstance(y, np.ndarray): + yrav = y.ravel() + mask = notnull(xrav) & notnull(yrav) + result[mask] = op(xrav[mask], yrav[mask]) + else: + mask = notnull(xrav) + result[mask] = op(xrav[mask], y) + + np.putmask(result, -mask, np.nan) + result = result.reshape(x.shape) + + return result + + @Appender(_arith_doc % name) + def f(self, other, axis=default_axis, level=None, fill_value=None): + if isinstance(other, DataFrame): # Another DataFrame + return self._combine_frame(other, na_op, fill_value, level) + elif isinstance(other, Series): + return self._combine_series(other, na_op, fill_value, axis, level) + elif isinstance(other, (list, tuple)): + if axis is not None and self._get_axis_name(axis) == 'index': + casted = Series(other, index=self.index) + else: + casted = Series(other, index=self.columns) + return self._combine_series(casted, na_op, fill_value, axis, level) + elif isinstance(other, np.ndarray): + if other.ndim == 1: + if axis is not None and self._get_axis_name(axis) == 'index': + casted = Series(other, index=self.index) + else: + casted = Series(other, index=self.columns) + return self._combine_series(casted, na_op, fill_value, + axis, level) + elif other.ndim == 2: + casted = DataFrame(other, index=self.index, + columns=self.columns) + return self._combine_frame(casted, na_op, fill_value, level) + else: # pragma: no cover + raise ValueError("Bad argument shape") + else: + return self._combine_const(other, na_op) + + f.__name__ = name + + return f + +def _flex_comp_method(op, name, default_axis='columns'): + + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + xrav = x.ravel() + result = np.empty(x.size, dtype=x.dtype) + if isinstance(y, np.ndarray): + yrav = y.ravel() + mask = notnull(xrav) & notnull(yrav) + result[mask] 
= op(np.array(list(xrav[mask])), + np.array(list(yrav[mask]))) + else: + mask = notnull(xrav) + result[mask] = op(np.array(list(xrav[mask])), y) + + if op == operator.ne: # pragma: no cover + np.putmask(result, -mask, True) + else: + np.putmask(result, -mask, False) + result = result.reshape(x.shape) + + return result + + @Appender('Wrapper for flexible comparison methods %s' % name) + def f(self, other, axis=default_axis, level=None): + if isinstance(other, DataFrame): # Another DataFrame + return self._flex_compare_frame(other, na_op, level) + + elif isinstance(other, Series): + return self._combine_series(other, na_op, None, axis, level) + + elif isinstance(other, (list, tuple)): + if axis is not None and self._get_axis_name(axis) == 'index': + casted = Series(other, index=self.index) + else: + casted = Series(other, index=self.columns) + + return self._combine_series(casted, na_op, None, axis, level) + + elif isinstance(other, np.ndarray): + if other.ndim == 1: + if axis is not None and self._get_axis_name(axis) == 'index': + casted = Series(other, index=self.index) + else: + casted = Series(other, index=self.columns) + + return self._combine_series(casted, na_op, None, axis, level) + + elif other.ndim == 2: + casted = DataFrame(other, index=self.index, + columns=self.columns) + + return self._flex_compare_frame(casted, na_op, level) + + else: # pragma: no cover + raise ValueError("Bad argument shape") + + else: + return self._combine_const(other, na_op) + + f.__name__ = name + + return f + + +def _comp_method(func, name): + @Appender('Wrapper for comparison method %s' % name) + def f(self, other): + if isinstance(other, DataFrame): # Another DataFrame + return self._compare_frame(other, func) + elif isinstance(other, Series): + return self._combine_series_infer(other, func) + else: + return self._combine_const(other, func) + + f.__name__ = name + + return f + + +#---------------------------------------------------------------------- +# DataFrame class + + +class DataFrame(NDFrame): + _auto_consolidate = True + _verbose_info = True + _het_axis = 1 + + _AXIS_NUMBERS = { + 'index': 0, + 'columns': 1 + } + + _AXIS_NAMES = dict((v, k) for k, v in _AXIS_NUMBERS.iteritems()) + + def __init__(self, data=None, index=None, columns=None, dtype=None, + copy=False): + """Two-dimensional size-mutable, potentially heterogeneous tabular data + structure with labeled axes (rows and columns). Arithmetic operations + align on both row and column labels. Can be thought of as a dict-like + container for Series objects. The primary pandas data structure + + Parameters + ---------- + data : numpy ndarray (structured or homogeneous), dict, or DataFrame + Dict can contain Series, arrays, constants, or list-like objects + index : Index or array-like + Index to use for resulting frame. Will default to np.arange(n) if + no indexing information part of input data and no index provided + columns : Index or array-like + Will default to np.arange(n) if not column labels provided + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. Only affects DataFrame / 2d ndarray input + + Examples + -------- + >>> d = {'col1': ts1, 'col2': ts2} + >>> df = DataFrame(data=d, index=index) + >>> df2 = DataFrame(np.random.randn(10, 5)) + >>> df3 = DataFrame(np.random.randn(10, 5), + ... 
columns=['a', 'b', 'c', 'd', 'e']) + + See also + -------- + DataFrame.from_records: constructor from tuples, also record arrays + DataFrame.from_dict: from dicts of Series, arrays, or dicts + DataFrame.from_csv: from CSV files + DataFrame.from_items: from sequence of (key, value) pairs + read_csv / read_table / read_clipboard + """ + if data is None: + data = {} + + if isinstance(data, DataFrame): + data = data._data + + if isinstance(data, BlockManager): + mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy) + elif isinstance(data, dict): + mgr = self._init_dict(data, index, columns, dtype=dtype) + elif isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + datacopy = ma.copy(data) + if issubclass(data.dtype.type, np.datetime64): + datacopy[mask] = lib.iNaT + else: + datacopy = com._maybe_upcast(datacopy) + datacopy[mask] = np.nan + mgr = self._init_ndarray(datacopy, index, columns, dtype=dtype, + copy=copy) + elif isinstance(data, np.ndarray): + if data.dtype.names: + data_columns, data = _rec_to_dict(data) + if columns is None: + columns = data_columns + mgr = self._init_dict(data, index, columns, dtype=dtype) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + elif isinstance(data, list): + if len(data) > 0: + if index is None and isinstance(data[0], Series): + index = _get_names_from_index(data) + + if isinstance(data[0], (list, tuple, dict, Series)): + conv_data, columns = _to_sdict(data, columns) + if isinstance(conv_data, dict): + if len(conv_data) == 0 and index is None: + index = np.arange(len(data)) + mgr = self._init_dict(conv_data, index, columns, + dtype=dtype) + else: + mgr = self._init_ndarray(conv_data, index, columns, + dtype=dtype, copy=copy) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + else: + mgr = self._init_ndarray(data, index, columns, dtype=dtype, + copy=copy) + else: + raise PandasError('DataFrame constructor not properly called!') + + NDFrame.__init__(self, mgr) + + @classmethod + def _from_axes(cls, data, axes): + # for construction from BlockManager + if isinstance(data, BlockManager): + return cls(data) + else: + columns, index = axes + return cls(data, index=index, columns=columns, copy=False) + + def _init_mgr(self, mgr, index, columns, dtype=None, copy=False): + if columns is not None: + mgr = mgr.reindex_axis(columns, axis=0, copy=False) + if index is not None: + mgr = mgr.reindex_axis(index, axis=1, copy=False) + # do not copy BlockManager unless explicitly done + if copy and dtype is None: + mgr = mgr.copy() + elif dtype is not None: + # no choice but to copy + mgr = mgr.astype(dtype) + return mgr + + def _init_dict(self, data, index, columns, dtype=None): + """ + Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. 
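+
+        For example (an illustrative note, not from the original docstring):
+        given a dict of Series with different indexes, such as
+        {'a': Series([1.], index=['x']), 'b': Series([2.], index=['y'])},
+        the frame is built on the union index ['x', 'y'], introducing NaN
+        where a column has no value, before the data is packed into blocks.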
+ """ + # prefilter if columns passed + if columns is not None: + columns = _ensure_index(columns) + data = dict((k, v) for k, v in data.iteritems() if k in columns) + else: + columns = Index(_try_sort(data.keys())) + + # figure out the index, if necessary + if index is None: + index = extract_index(data) + else: + index = _ensure_index(index) + + # don't force copy because getting jammed in an ndarray anyway + homogenized = _homogenize(data, index, columns, dtype) + + # from BlockManager perspective + axes = [columns, index] + + # segregates dtypes and forms blocks matching to columns + blocks = form_blocks(homogenized, axes) + + # consolidate for now + mgr = BlockManager(blocks, axes) + return mgr.consolidate() + + def _init_ndarray(self, values, index, columns, dtype=None, + copy=False): + if isinstance(values, Series): + if columns is None and values.name is not None: + columns = [values.name] + if index is None: + index = values.index + else: + values = values.reindex(index) + + values = _prep_ndarray(values, copy=copy) + + if dtype is not None: + try: + values = values.astype(dtype) + except Exception: + raise ValueError('failed to cast to %s' % dtype) + + N, K = values.shape + + if index is None: + index = _default_index(N) + else: + index = _ensure_index(index) + + if columns is None: + columns = _default_index(K) + else: + columns = _ensure_index(columns) + + block = make_block(values.T, columns, columns) + return BlockManager([block], [columns, index]) + + def _wrap_array(self, arr, axes, copy=False): + index, columns = axes + return self._constructor(arr, index=index, columns=columns, copy=copy) + + @property + def axes(self): + return [self.index, self.columns] + + @property + def _constructor(self): + return DataFrame + + # Fancy indexing + _ix = None + + @property + def ix(self): + if self._ix is None: + self._ix = _NDFrameIndexer(self) + + return self._ix + + @property + def shape(self): + return (len(self.index), len(self.columns)) + + #---------------------------------------------------------------------- + # Class behavior + + @property + def empty(self): + return not (len(self.columns) > 0 and len(self.index) > 0) + + def __nonzero__(self): + raise ValueError("Cannot call bool() on DataFrame.") + + def _need_info_repr_(self): + """ + Check if it is needed to use info/summary view to represent a + particular DataFrame. + """ + config = fmt.print_config + + terminal_width, terminal_height = get_terminal_size() + max_rows = (terminal_height if config.max_rows == 0 + else config.max_rows) + max_columns = config.max_columns + + if max_columns > 0: + if len(self.index) <= max_rows and \ + len(self.columns) <= max_columns: + return False + else: + return True + else: + # save us + if (len(self.index) > max_rows or + len(self.columns) > terminal_width // 2): + return True + else: + buf = StringIO() + self.to_string(buf=buf) + value = buf.getvalue() + if max([len(l) for l in value.split('\n')]) > terminal_width: + return True + else: + return False + + def __repr__(self): + """ + Return a string representation for a particular DataFrame + """ + buf = StringIO() + if self._need_info_repr_(): + self.info(buf=buf, verbose=self._verbose_info) + else: + self.to_string(buf=buf) + value = buf.getvalue() + return com.console_encode(value) + + def _repr_html_(self): + """ + Return a html representation for a particular DataFrame. + Mainly for IPython notebook. + """ + if fmt.print_config.notebook_repr_html: + if self._need_info_repr_(): + return None + else: + return ('
\n' + + self.to_html() + '\n
') + else: + return None + + def __iter__(self): + """ + Iterate over columns of the frame. + """ + return iter(self.columns) + + def keys(self): + return self.columns + + def iteritems(self): + """Iterator over (column, series) pairs""" + return ((k, self[k]) for k in self.columns) + + def iterrows(self): + """ + Iterate over rows of DataFrame as (index, Series) pairs + """ + columns = self.columns + for k, v in izip(self.index, self.values): + s = v.view(Series) + s.index = columns + s.name = k + yield k, s + + def itertuples(self, index=True): + """ + Iterate over rows of DataFrame as tuples, with index value + as first element of the tuple + """ + arrays = [] + if index: + arrays.append(self.index) + arrays.extend(self[k] for k in self.columns) + return izip(*arrays) + + iterkv = iteritems + if py3compat.PY3: # pragma: no cover + items = iteritems + + def __len__(self): + """Returns length of index""" + return len(self.index) + + def __contains__(self, key): + """True if DataFrame has this column""" + return key in self.columns + + #---------------------------------------------------------------------- + # Arithmetic methods + + add = _arith_method(operator.add, 'add') + mul = _arith_method(operator.mul, 'multiply') + sub = _arith_method(operator.sub, 'subtract') + div = divide = _arith_method(lambda x, y: x / y, 'divide') + + radd = _arith_method(_radd_compat, 'radd') + rmul = _arith_method(operator.mul, 'rmultiply') + rsub = _arith_method(lambda x, y: y - x, 'rsubtract') + rdiv = _arith_method(lambda x, y: y / x, 'rdivide') + + __add__ = _arith_method(operator.add, '__add__', default_axis=None) + __sub__ = _arith_method(operator.sub, '__sub__', default_axis=None) + __mul__ = _arith_method(operator.mul, '__mul__', default_axis=None) + __truediv__ = _arith_method(operator.truediv, '__truediv__', + default_axis=None) + __floordiv__ = _arith_method(operator.floordiv, '__floordiv__', + default_axis=None) + __pow__ = _arith_method(operator.pow, '__pow__', default_axis=None) + + __radd__ = _arith_method(_radd_compat, '__radd__', default_axis=None) + __rmul__ = _arith_method(operator.mul, '__rmul__', default_axis=None) + __rsub__ = _arith_method(lambda x, y: y - x, '__rsub__', default_axis=None) + __rtruediv__ = _arith_method(lambda x, y: y / x, '__rtruediv__', + default_axis=None) + __rfloordiv__ = _arith_method(lambda x, y: y // x, '__rfloordiv__', + default_axis=None) + __rpow__ = _arith_method(lambda x, y: y ** x, '__rpow__', + default_axis=None) + + # boolean operators + __and__ = _arith_method(operator.and_, '__and__') + __or__ = _arith_method(operator.or_, '__or__') + __xor__ = _arith_method(operator.xor, '__xor__') + + # Python 2 division methods + if not py3compat.PY3: + __div__ = _arith_method(operator.div, '__div__', default_axis=None) + __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__', + default_axis=None) + + def __neg__(self): + arr = operator.neg(self.values) + return self._wrap_array(arr, self.axes, copy=False) + + # Comparison methods + __eq__ = _comp_method(operator.eq, '__eq__') + __ne__ = _comp_method(operator.ne, '__ne__') + __lt__ = _comp_method(operator.lt, '__lt__') + __gt__ = _comp_method(operator.gt, '__gt__') + __le__ = _comp_method(operator.le, '__le__') + __ge__ = _comp_method(operator.ge, '__ge__') + + eq = _flex_comp_method(operator.eq, 'eq') + ne = _flex_comp_method(operator.ne, 'ne') + gt = _flex_comp_method(operator.gt, 'gt') + lt = _flex_comp_method(operator.lt, 'lt') + ge = _flex_comp_method(operator.ge, 'ge') + le = _flex_comp_method(operator.le, 
'le') + + def dot(self, other): + """ + Matrix multiplication with DataFrame objects. Does no data alignment + + Parameters + ---------- + other : DataFrame + + Returns + ------- + dot_product : DataFrame + """ + lvals = self.values + rvals = other.values + result = np.dot(lvals, rvals) + return DataFrame(result, index=self.index, columns=other.columns) + + #---------------------------------------------------------------------- + # IO methods (to / from other formats) + + @classmethod + def from_dict(cls, data, orient='columns', dtype=None): + """ + Construct DataFrame from dict of array-like or dicts + + Parameters + ---------- + data : dict + {field : array-like} or {field : dict} + orient : {'columns', 'index'}, default 'columns' + The "orientation" of the data. If the keys of the passed dict + should be the columns of the resulting DataFrame, pass 'columns' + (default). Otherwise if the keys should be rows, pass 'index'. + + Returns + ------- + DataFrame + """ + from collections import defaultdict + + orient = orient.lower() + if orient == 'index': + # TODO: this should be seriously cythonized + new_data = defaultdict(dict) + for index, s in data.iteritems(): + for col, v in s.iteritems(): + new_data[col][index] = v + data = new_data + elif orient != 'columns': # pragma: no cover + raise ValueError('only recognize index or columns for orient') + + return DataFrame(data, dtype=dtype) + + def to_dict(self, outtype='dict'): + """ + Convert DataFrame to dictionary. + + Parameters + ---------- + outtype : str {'dict', 'list', 'series'} + Determines the type of the values of the dictionary. The + default `dict` is a nested dictionary {column -> {index -> value}}. + `list` returns {column -> list(values)}. `series` returns + {column -> Series(values)}. + Abbreviations are allowed. 
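+
+        Illustrative example (the small frame below is made up for this
+        sketch):
+
+        >>> df = DataFrame({'col1': [1, 2]}, index=['a', 'b'])
+        >>> df.to_dict()
+        {'col1': {'a': 1, 'b': 2}}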
+ + + Returns + ------- + result : dict like {column -> {index -> value}} + """ + if outtype.lower().startswith('d'): + return dict((k, v.to_dict()) for k, v in self.iteritems()) + elif outtype.lower().startswith('l'): + return dict((k, v.tolist()) for k, v in self.iteritems()) + elif outtype.lower().startswith('s'): + return dict((k, v) for k,v in self.iteritems()) + else: # pragma: no cover + raise ValueError("outtype %s not understood" % outtype) + + @classmethod + def from_records(cls, data, index=None, exclude=None, columns=None, + names=None, coerce_float=False): + """ + Convert structured or record ndarray to DataFrame + + Parameters + ---------- + data : ndarray (structured dtype), list of tuples, or DataFrame + index : string, list of fields, array-like + Field of array to use as the index, alternately a specific set of + input labels to use + exclude: sequence, default None + Columns or fields to exclude + columns : sequence, default None + Column names to use, replacing any found in passed data + coerce_float : boolean, default False + Attempt to convert values to non-string, non-numeric objects (like + decimal.Decimal) to floating point, useful for SQL result sets + + Returns + ------- + df : DataFrame + """ + import warnings + + # Make a copy of the input columns so we can modify it + if columns is not None: + columns = list(columns) + + if len(algos.unique(columns)) < len(columns): + raise ValueError('Non-unique columns not yet supported in from_records') + + if names is not None: # pragma: no cover + columns = names + warnings.warn("'names' parameter to DataFrame.from_records is " + "being renamed to 'columns', 'names' will be " + "removed in 0.8.0", + FutureWarning) + + if isinstance(data, (np.ndarray, DataFrame, dict)): + columns, sdict = _rec_to_dict(data) + else: + sdict, columns = _to_sdict(data, columns, + coerce_float=coerce_float) + + if exclude is None: + exclude = set() + else: + exclude = set(exclude) + + for col in exclude: + del sdict[col] + columns.remove(col) + + if index is not None: + if (isinstance(index, basestring) or + not hasattr(index, "__iter__")): + result_index = sdict.pop(index) + columns.remove(index) + else: + try: + arrays = [] + for field in index: + arrays.append(sdict[field]) + for field in index: + del sdict[field] + columns.remove(field) + result_index = MultiIndex.from_arrays(arrays) + except Exception: + result_index = index + elif isinstance(data, dict) and len(data) > 0: + # utilize first element of sdict to get length + result_index = np.arange(len(data.values()[0])) + else: + result_index = np.arange(len(data)) + + return cls(sdict, index=result_index, columns=columns) + + def to_records(self, index=True): + """ + Convert DataFrame to record array. Index will be put in the + 'index' field of the record array if requested + + Parameters + ---------- + index : boolean, default True + Include index in resulting record array, stored in 'index' field + + Returns + ------- + y : recarray + """ + if index: + arrays = [self.index] + [self[c] for c in self.columns] + names = ['index'] + list(map(str, self.columns)) + else: + arrays = [self[c] for c in self.columns] + names = list(map(str, self.columns)) + + return np.rec.fromarrays(arrays, names=names) + + @classmethod + def from_items(cls, items, columns=None, orient='columns'): + """ + Convert (key, value) pairs to DataFrame. The keys will be the axis + index (usually the columns, but depends on the specified + orientation). 
The values should be arrays or Series + + Parameters + ---------- + items : sequence of (key, value) pairs + Values should be arrays or Series + columns : sequence, optional + Must be passed in the + orient : {'columns', 'index'}, default 'items' + The "orientation" of the data. If the keys of the passed dict + should be the items of the result panel, pass 'items' + (default). Otherwise if the columns of the values of the passed + DataFrame objects should be the items (which in the case of + mixed-dtype data you should do), instead pass 'minor' + + Returns + ------- + frame : DataFrame + """ + keys, values = zip(*items) + + if orient == 'columns': + cols_to_use = columns if columns is not None else keys + # iterable may have been consumed + return DataFrame(dict(zip(keys, values)), columns=cols_to_use) + elif orient == 'index': + if columns is None: + raise ValueError("Must pass columns with orient='index'") + + arr = np.array(values, dtype=object).T + new_data = dict((k, lib.maybe_convert_objects(v)) + for k, v in zip(columns, arr)) + return DataFrame(new_data, index=keys, columns=columns) + elif orient != 'columns': # pragma: no cover + raise ValueError('only recognize index or columns for orient') + + @classmethod + def from_csv(cls, path, header=0, sep=',', index_col=0, + parse_dates=True, encoding=None): + """ + Read delimited file into DataFrame + + Parameters + ---------- + path : string file path or file handle / StringIO + header : int, default 0 + Row to use at header (skip prior rows) + sep : string, default ',' + Field delimiter + index_col : int or sequence, default 0 + Column to use for index. If a sequence is given, a MultiIndex + is used. Different default from read_table + parse_dates : boolean, default True + Parse dates. Different default from read_table + + Notes + ----- + Preferable to use read_table for most general purposes but from_csv + makes for an easy roundtrip to and from file, especially with a + DataFrame of time series data + + Returns + ------- + y : DataFrame + """ + from pandas.io.parsers import read_table + return read_table(path, header=header, sep=sep, + parse_dates=parse_dates, index_col=index_col, + encoding=encoding) + + def to_sparse(self, fill_value=None, kind='block'): + """ + Convert to SparseDataFrame + + Parameters + ---------- + fill_value : float, default NaN + kind : {'block', 'integer'} + + Returns + ------- + y : SparseDataFrame + """ + from pandas.core.sparse import SparseDataFrame + return SparseDataFrame(self._series, index=self.index, + default_kind=kind, + default_fill_value=fill_value) + + def to_panel(self): + """ + Transform long (stacked) format (DataFrame) into wide (3D, Panel) + format. + + Currently the index of the DataFrame must be a 2-level MultiIndex. 
This + may be generalized later + + Returns + ------- + panel : Panel + """ + from pandas.core.panel import Panel + from pandas.core.reshape import block2d_to_block3d + + # only support this kind for now + assert(isinstance(self.index, MultiIndex) and + len(self.index.levels) == 2) + + self._consolidate_inplace() + + # minor axis must be sorted + if self.index.lexsort_depth < 2: + selfsorted = self.sortlevel(0) + else: + selfsorted = self + + major_axis, minor_axis = selfsorted.index.levels + major_labels, minor_labels = selfsorted.index.labels + + shape = len(major_axis), len(minor_axis) + + new_blocks = [] + for block in selfsorted._data.blocks: + newb = block2d_to_block3d(block.values.T, block.items, shape, + major_labels, minor_labels, + ref_items=selfsorted.columns) + new_blocks.append(newb) + + new_axes = [selfsorted.columns, major_axis, minor_axis] + new_mgr = BlockManager(new_blocks, new_axes) + + return Panel(new_mgr) + + to_wide = deprecate('to_wide', to_panel) + + def _helper_csvexcel(self, writer, na_rep=None, cols=None, + header=True, index=True, index_label=None): + if cols is None: + cols = self.columns + + series = {} + for k, v in self._series.iteritems(): + series[k] = v.values + + has_aliases = isinstance(header, (tuple, list, np.ndarray)) + if has_aliases or header: + if index: + # should write something for index label + if index_label is None: + if isinstance(self.index, MultiIndex): + index_label = [] + for i, name in enumerate(self.index.names): + if name is None: + name = '' + index_label.append(name) + else: + index_label = self.index.name + if index_label is None: + index_label = [''] + else: + index_label = [index_label] + elif not isinstance(index_label, (list, tuple, np.ndarray)): + # given a string for a DF with Index + index_label = [index_label] + + encoded_labels = list(index_label) + if has_aliases: + if len(header) != len(cols): + raise ValueError(('Writing %d cols but got %d aliases' + % (len(cols), len(header)))) + else: + write_cols = header + else: + write_cols = cols + encoded_cols = list(write_cols) + + writer.writerow(encoded_labels + encoded_cols) + else: + encoded_cols = list(cols) + writer.writerow(encoded_cols) + + nlevels = getattr(self.index, 'nlevels', 1) + for j, idx in enumerate(self.index): + row_fields = [] + if index: + if nlevels == 1: + row_fields = [idx] + else: # handle MultiIndex + row_fields = list(idx) + for i, col in enumerate(cols): + val = series[col][j] + if isnull(val): + val = na_rep + + row_fields.append(val) + + writer.writerow(row_fields) + + def to_csv(self, path_or_buf, sep=",", na_rep='', cols=None, + header=True, index=True, index_label=None, + mode='w', nanRep=None, encoding=None): + """ + Write DataFrame to a comma-separated values (csv) file + + Parameters + ---------- + path_or_buf : string or file handle / StringIO + File path + na_rep : string, default '' + Missing data representation + cols : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + mode : Python write mode, default 'w' + sep : character, default "," + Field delimiter for the output file. 
+ encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + """ + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + + if hasattr(path_or_buf, 'read'): + f = path_or_buf + close = False + else: + f = com._get_handle(path_or_buf, mode, encoding=encoding) + close = True + + try: + if encoding is not None: + csvout = com.UnicodeWriter(f, lineterminator='\n', + delimiter=sep, encoding=encoding) + else: + csvout = csv.writer(f, lineterminator='\n', delimiter=sep) + self._helper_csvexcel(csvout, na_rep=na_rep, cols=cols, + header=header, index=index, + index_label=index_label) + + finally: + if close: + f.close() + + def to_excel(self, excel_writer, sheet_name='sheet1', na_rep='', + cols=None, header=True, index=True, index_label=None): + """ + Write DataFrame to a excel sheet + + Parameters + ---------- + excel_writer : string or ExcelWriter object + File path or existing ExcelWriter + sheet_name : string, default 'sheet1' + Name of sheet which will contain DataFrame + na_rep : string, default '' + Missing data rep'n + cols : sequence, optional + Columns to write + header : boolean or list of string, default True + Write out column names. If a list of string is given it is + assumed to be aliases for the column names + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + + Notes + ----- + If passing an existing ExcelWriter object, then the sheet will be added + to the existing workbook. This can be used to save different + DataFrames to one workbook + >>> writer = ExcelWriter('output.xlsx') + >>> df1.to_excel(writer,'sheet1') + >>> df2.to_excel(writer,'sheet2') + >>> writer.save() + """ + from pandas.io.parsers import ExcelWriter + need_save = False + if isinstance(excel_writer, str): + excel_writer = ExcelWriter(excel_writer) + need_save = True + excel_writer.cur_sheet = sheet_name + self._helper_csvexcel(excel_writer, na_rep=na_rep, cols=cols, + header=header, index=index, + index_label=index_label) + if need_save: + excel_writer.save() + + @Appender(fmt.docstring_to_string, indents=1) + def to_string(self, buf=None, columns=None, col_space=None, colSpace=None, + header=True, index=True, na_rep='NaN', formatters=None, + float_format=None, sparsify=True, nanRep=None, + index_names=True, justify=None, force_unicode=False): + """ + Render a DataFrame to a console-friendly tabular output. 
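+
+ A minimal usage sketch (the tiny frame here is purely illustrative):
+
+ >>> df = DataFrame({'A': [1, 2], 'B': [3, 4]})
+ >>> print df.to_string(index=False)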
+ """ + + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", + FutureWarning) + na_rep = nanRep + + if colSpace is not None: # pragma: no cover + import warnings + warnings.warn("colSpace is deprecated, use col_space", + FutureWarning) + col_space = colSpace + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + justify=justify, + index_names=index_names, + header=header, index=index) + formatter.to_string(force_unicode=force_unicode) + + if buf is None: + result = formatter.buf.getvalue() + if not force_unicode: + try: + result = str(result) + except ValueError: + pass + return result + + @Appender(fmt.docstring_to_string, indents=1) + def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, + header=True, index=True, na_rep='NaN', formatters=None, + float_format=None, sparsify=True, index_names=True, + bold_rows=True): + """ + to_html-specific options + bold_rows : boolean, default True + Make the row labels bold in the output + + Render a DataFrame to an html table. + """ + + if colSpace is not None: # pragma: no cover + import warnings + warnings.warn("colSpace is deprecated, use col_space", + FutureWarning) + col_space = colSpace + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + header=header, index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names) + formatter.to_html() + + if buf is None: + return formatter.buf.getvalue() + + def info(self, verbose=True, buf=None): + """ + Concise summary of a DataFrame, used in __repr__ when very large. 
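+
+ For example (a sketch; 'report.txt' is just a placeholder file name):
+
+ >>> df = DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
+ >>> df.info()                              # summary goes to sys.stdout
+ >>> df.info(buf=open('report.txt', 'w'))   # or to any writable buffer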
+ + Parameters + ---------- + verbose : boolean, default True + If False, don't print column count summary + buf : writable buffer, defaults to sys.stdout + """ + from pandas.core.format import _put_lines + + if buf is None: # pragma: no cover + buf = sys.stdout + + lines = [] + + lines.append(str(type(self))) + lines.append(self.index.summary()) + + if len(self.columns) == 0: + lines.append('Empty %s' % type(self).__name__) + _put_lines(buf, lines) + return + + cols = self.columns + + # hack + if verbose and len(self.columns) < 100: + lines.append('Data columns:') + space = max([len(_stringify(k)) for k in self.columns]) + 4 + counts = self.count() + assert(len(cols) == len(counts)) + for col, count in counts.iteritems(): + if not isinstance(col, (unicode, str)): + col = str(col) + lines.append(_put_str(col, space) + + '%d non-null values' % count) + else: + lines.append(self.columns.summary(name='Columns')) + + counts = self.get_dtype_counts() + dtypes = ['%s(%d)' % k for k in sorted(counts.iteritems())] + lines.append('dtypes: %s' % ', '.join(dtypes)) + _put_lines(buf, lines) + + @property + def dtypes(self): + return self.apply(lambda x: x.dtype) + + def convert_objects(self): + """ + Attempt to infer better dtype for object columns + + Returns + ------- + converted : DataFrame + """ + new_data = {} + + # TODO: could be more efficient taking advantage of the block + for col, s in self.iteritems(): + if s.dtype == np.object_: + new_data[col] = lib.maybe_convert_objects(s) + else: + new_data[col] = s + + return DataFrame(new_data, index=self.index, columns=self.columns) + + def get_dtype_counts(self): + counts = {} + for _, series in self.iterkv(): + # endianness can cause dtypes to look different + dtype_str = str(series.dtype) + if dtype_str in counts: + counts[dtype_str] += 1 + else: + counts[dtype_str] = 1 + return Series(counts) + + #---------------------------------------------------------------------- + # properties for index and columns + + columns = lib.AxisProperty(0) + index = lib.AxisProperty(1) + + def as_matrix(self, columns=None): + """ + Convert the frame to its Numpy-array matrix representation. Columns + are presented in sorted order unless a specific list of columns is + provided. + + Parameters + ---------- + columns : array-like + Specific column order + + Returns + ------- + values : ndarray + If the DataFrame is heterogeneous and contains booleans or objects, + the result will be of dtype=object + """ + self._consolidate_inplace() + return self._data.as_matrix(columns).T + + values = property(fget=as_matrix) + + def transpose(self): + """ + Returns a DataFrame with the rows/columns switched. 
If the DataFrame is + homogeneously-typed, the data is not copied + """ + return self._constructor(data=self.values.T, index=self.columns, + columns=self.index, copy=False) + + T = property(transpose) + + def swapaxes(self, i, j): + """ + Like ndarray.swapaxes, equivalent to transpose + + Returns + ------- + swapped : DataFrame + View on original data (no copy) + """ + if i in (0, 1) and j in (0, 1): + if i == j: + return self + return self._constructor(data=self.values.T, index=self.columns, + columns=self.index, copy=False) + else: + raise ValueError('Axis numbers must be in (0, 1)') + + #---------------------------------------------------------------------- + # Picklability + + def __getstate__(self): + return self._data + + def __setstate__(self, state): + # old DataFrame pickle + if isinstance(state, BlockManager): + self._data = state + elif isinstance(state[0], dict): # pragma: no cover + self._unpickle_frame_compat(state) + else: # pragma: no cover + # old pickling format, for compatibility + self._unpickle_matrix_compat(state) + + # ordinarily created in NDFrame + self._item_cache = {} + + # legacy pickle formats + def _unpickle_frame_compat(self, state): # pragma: no cover + from pandas.core.common import _unpickle_array + if len(state) == 2: # pragma: no cover + series, idx = state + columns = sorted(series) + else: + series, cols, idx = state + columns = _unpickle_array(cols) + + index = _unpickle_array(idx) + self._data = self._init_dict(series, index, columns, None) + + def _unpickle_matrix_compat(self, state): # pragma: no cover + from pandas.core.common import _unpickle_array + # old unpickling + (vals, idx, cols), object_state = state + + index = _unpickle_array(idx) + dm = DataFrame(vals, index=index, columns=_unpickle_array(cols), + copy=False) + + if object_state is not None: + ovals, _, ocols = object_state + objects = DataFrame(ovals, index=index, + columns=_unpickle_array(ocols), + copy=False) + + dm = dm.join(objects) + + self._data = dm._data + + #---------------------------------------------------------------------- + # Array interface + + def __array__(self, dtype=None): + return self.values + + def __array_wrap__(self, result): + return self._constructor(result, index=self.index, + columns=self.columns, copy=False) + + #---------------------------------------------------------------------- + # Getting and setting elements + + def get_value(self, index, col): + """ + Quickly retrieve single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + + Returns + ------- + value : scalar value + """ + series = self._get_item_cache(col) + engine = self.index._engine + return engine.get_value(series, index) + + def set_value(self, index, col, value): + """ + Put single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + value : scalar value + + Returns + ------- + frame : DataFrame + If label pair is contained, will be reference to calling DataFrame, + otherwise a new object + """ + try: + series = self._get_item_cache(col) + engine = self.index._engine + engine.set_value(series, index, value) + return self + except KeyError: + new_index, new_columns = self._expand_axes((index, col)) + result = self.reindex(index=new_index, columns=new_columns, + copy=False) + likely_dtype = com._infer_dtype(value) + + made_bigger = not np.array_equal(new_columns, self.columns) + + # how to make this logic simpler? 
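+ # the requested label pair was missing, so the frame was enlarged by the
+ # reindex above; if that introduced a brand-new column, cast it to the
+ # dtype inferred from `value` before retrying set_value on the copy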
+ if made_bigger: + com._possibly_cast_item(result, col, likely_dtype) + + return result.set_value(index, col, value) + + def irow(self, i, copy=False): + """ + Retrieve the i-th row or rows of the DataFrame by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Notes + ----- + If slice passed, the resulting data will be a view + + Returns + ------- + row : Series (int) or DataFrame (slice, sequence) + """ + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + try: + new_values = self._data.fast_2d_xs(i, copy=copy) + except: + new_values = self._data.fast_2d_xs(i, copy=True) + return Series(new_values, index=self.columns, + name=self.index[i]) + + def icol(self, i): + """ + Retrieve the i-th column or columns of the DataFrame by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Notes + ----- + If slice passed, the resulting data will be a view + + Returns + ------- + column : Series (int) or DataFrame (slice, sequence) + """ + label = self.columns[i] + if isinstance(i, slice): + # need to return view + lab_slice = slice(label[0], label[-1]) + return self.ix[:, lab_slice] + else: + label = self.columns[i] + if isinstance(label, Index): + return self.reindex(columns=label) + + values = self._data.iget(i) + return Series(values, index=self.index, name=label) + + def _ixs(self, i, axis=0): + if axis == 0: + return self.irow(i) + else: + return self.icol(i) + + def iget_value(self, i, j): + """ + Return scalar value stored at row i and column j, where i and j are + integers + + Parameters + ---------- + i : int + j : int + + Returns + ------- + value : scalar value + """ + row = self.index[i] + col = self.columns[j] + return self.get_value(row, col) + + def __getitem__(self, key): + # slice rows + if isinstance(key, slice): + from pandas.core.indexing import _is_index_slice + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + new_data = self._data.get_slice(indexer, axis=1) + return self._constructor(new_data) + # either boolean or fancy integer index + elif isinstance(key, (np.ndarray, list)): + if isinstance(key, list): + key = lib.list_to_object_array(key) + + # also raises Exception if object array with NA values + if com._is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + return self._getitem_array(key) + elif isinstance(self.columns, MultiIndex): + return self._getitem_multilevel(key) + elif isinstance(key, DataFrame): + values = key.values + if values.dtype == bool: + return self.values[values] + else: + raise ValueError('Cannot index using non-boolean DataFrame') + else: + return self._get_item_cache(key) + + def _getitem_array(self, key): + if key.dtype == np.bool_: + if len(key) != len(self.index): + raise ValueError('Item wrong length %d instead of %d!' 
% + (len(key), len(self.index))) + + inds, = key.nonzero() + return self.take(inds) + else: + indexer = self.columns.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise KeyError("No column(s) named: %s" % str(key[mask])) + result = self.reindex(columns=key) + if result.columns.name is None: + result.columns.name = self.columns.name + return result + + def _slice(self, slobj, axis=0): + if axis == 0: + mgr_axis = 1 + else: + mgr_axis = 0 + + new_data = self._data.get_slice(slobj, axis=mgr_axis) + return self._constructor(new_data) + + def _getitem_multilevel(self, key): + loc = self.columns.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_columns = self.columns[loc] + result_columns = _maybe_droplevels(new_columns, key) + if self._is_mixed_type: + result = self.reindex(columns=new_columns) + result.columns = result_columns + else: + new_values = self.values[:, loc] + result = DataFrame(new_values, index=self.index, + columns=result_columns) + if len(result.columns) == 1: + top = result.columns[0] + if (type(top) == str and top == '' or + type(top) == tuple and top[0] == ''): + result = Series(result[''], index=self.index, name=key) + return result + else: + return self._get_item_cache(key) + + def _box_item_values(self, key, values): + if values.ndim == 2: + item_cols = self.columns[self.columns.get_loc(key)] + return DataFrame(values.T, columns=item_cols, + index=self.index) + else: + return Series(values, index=self.index, name=key) + + def __getattr__(self, name): + """After regular attribute access, try looking up the name of a column. + This allows simpler access to columns for interactive use.""" + if name in self.columns: + return self[name] + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, name)) + + def __setattr__(self, name, value): + """After regular attribute access, try looking up the name of a column. + This allows simpler access to columns for interactive use.""" + if name == '_data': + super(DataFrame, self).__setattr__(name, value) + else: + try: + existing = getattr(self, name) + if isinstance(existing, Index): + super(DataFrame, self).__setattr__(name, value) + elif name in self.columns: + self[name] = value + else: + object.__setattr__(self, name, value) + except (AttributeError, TypeError): + object.__setattr__(self, name, value) + + def __setitem__(self, key, value): + # support boolean setting with DataFrame input, e.g. 
+ # df[df > df2] = 0 + if isinstance(key, DataFrame): + if not (key.index.equals(self.index) and + key.columns.equals(self.columns)): + raise PandasError('Can only index with like-indexed ' + 'DataFrame objects') + + self._boolean_set(key, value) + elif isinstance(key, (np.ndarray, list)): + return self._set_item_multiple(key, value) + else: + # set column + self._set_item(key, value) + + def _boolean_set(self, key, value): + mask = key.values + if mask.dtype != np.bool_: + raise ValueError('Must pass DataFrame with boolean values only') + + if self._is_mixed_type: + raise ValueError('Cannot do boolean setting on mixed-type frame') + + if isinstance(value, DataFrame): + assert(value._indexed_same(self)) + np.putmask(self.values, mask, value.values) + else: + self.values[mask] = value + + def _set_item_multiple(self, keys, value): + if isinstance(value, DataFrame): + assert(len(value.columns) == len(keys)) + for k1, k2 in zip(keys, value.columns): + self[k1] = value[k2] + else: + if isinstance(keys, np.ndarray) and keys.dtype == np.bool_: + # boolean slicing should happen on rows, consistent with + # behavior of getitem + self.ix[keys, :] = value + else: + self.ix[:, keys] = value + + def _set_item(self, key, value): + """ + Add series to DataFrame in specified column. + + If series is a numpy-array (not a Series/TimeSeries), it must be the + same length as the DataFrame's index or an error will be thrown. + + Series/TimeSeries will be conformed to the DataFrame's index to + ensure homogeneity. + """ + value = self._sanitize_column(key, value) + NDFrame._set_item(self, key, value) + + def insert(self, loc, column, value): + """ + Insert column into DataFrame at specified location. Raises Exception if + column is already contained in the DataFrame + + Parameters + ---------- + loc : int + Must have 0 <= loc <= len(columns) + column : object + value : int, Series, or array-like + """ + value = self._sanitize_column(column, value) + self._data.insert(loc, column, value) + + def _sanitize_column(self, key, value): + # Need to make sure new columns (which go into the BlockManager as new + # blocks) are always copied + if _is_sequence(value): + if isinstance(value, Series): + if value.index.equals(self.index): + # copy the values + value = value.values.copy() + else: + value = value.reindex(self.index).values + else: + assert(len(value) == len(self.index)) + + if not isinstance(value, np.ndarray): + value = com._asarray_tuplesafe(value) + else: + value = value.copy() + else: + value = np.repeat(value, len(self.index)) + if key in self.columns: + existing_column = self[key] + # special case for now + if (com.is_float_dtype(existing_column) and + com.is_integer_dtype(value)): + value = value.astype(np.float64) + + return np.atleast_2d(np.asarray(value)) + + def pop(self, item): + """ + Return column and drop from frame. Raise KeyError if not found. + + Returns + ------- + column : Series + """ + return NDFrame.pop(self, item) + + # to support old APIs + @property + def _series(self): + return self._data.get_series_dict() + + def xs(self, key, axis=0, level=None, copy=True): + """ + Returns a cross-section (row or column) from the DataFrame as a Series + object. 
Defaults to returning a row (axis 0) + + Parameters + ---------- + key : object + Some label contained in the index, or partially in a MultiIndex + axis : int, default 0 + Axis to retrieve cross-section on + copy : boolean, default True + Whether to make a copy of the data + + Returns + ------- + xs : Series + """ + labels = self._get_axis(axis) + if level is not None: + loc, new_ax = labels.get_loc_level(key, level=level) + + # level = 0 + if not isinstance(loc, slice): + indexer = [slice(None, None)] * 2 + indexer[axis] = loc + indexer = tuple(indexer) + else: + indexer = loc + + result = self.ix[indexer] + setattr(result, result._get_axis_name(axis), new_ax) + return result + + if axis == 1: + data = self[key] + if copy: + data = data.copy() + return data + + self._consolidate_inplace() + + index = self.index + if isinstance(index, MultiIndex): + loc, new_index = self.index.get_loc_level(key) + else: + loc = self.index.get_loc(key) + + if isinstance(loc, np.ndarray): + if loc.dtype == np.bool_: + inds, = loc.nonzero() + if len(inds) == 1: + loc = inds[0] + else: + return self.take(inds, axis=axis) + else: + return self.take(loc, axis=axis) + + if not np.isscalar(loc): + new_index = self.index[loc] + + if np.isscalar(loc): + new_values = self._data.fast_2d_xs(loc, copy=copy) + return Series(new_values, index=self.columns, name=key) + else: # isinstance(loc, slice) or loc.dtype == np.bool_: + result = self[loc] + result.index = new_index + return result + # else: + # return self.take(loc) + + def lookup(self, row_labels, col_labels): + """ + Label-based "fancy indexing" function for DataFrame. Given equal-length + arrays of row and column labels, return an array of the values + corresponding to each (row, col) pair. + + Parameters + ---------- + row_labels : sequence + col_labels : sequence + + Notes + ----- + Akin to + + result = [] + for row, col in zip(row_labels, col_labels): + result.append(df.get_value(row, col)) + + Example + ------- + values : ndarray + """ + from itertools import izip + + n = len(row_labels) + assert(n == len(col_labels)) + + thresh = 1000 + if not self._is_mixed_type or n > thresh: + values = self.values + ridx = self.index.get_indexer(row_labels) + cidx = self.columns.get_indexer(col_labels) + if (ridx == -1).any(): + raise ValueError('One or more row labels was not found') + if (cidx == -1).any(): + raise ValueError('One or more column labels was not found') + flat_index = ridx * len(self.columns) + cidx + result = values.flat[flat_index] + else: + result = np.empty(n, dtype='O') + for i, (r, c) in enumerate(izip(row_labels, col_labels)): + result[i] = self.get_value(r, c) + + if result.dtype == 'O': + result = lib.maybe_convert_objects(result) + + return result + + #---------------------------------------------------------------------- + # Reindexing and alignment + + def align(self, other, join='outer', axis=None, level=None, copy=True, + fill_value=np.nan, method=None, limit=None, fill_axis=0): + """ + Align two DataFrame object on their index and columns with the + specified join method for each axis Index + + Parameters + ---------- + other : DataFrame or Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + axis : {0, 1, None}, default None + Align on index (0), columns (1), or both (None) + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + copy : boolean, default True + Always returns new objects. If copy=False and no reindexing is + required then original objects are returned. 
+ fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value + method : str, default None + limit : int, default None + fill_axis : {0, 1}, default 0 + Filling axis, method and limit + + Returns + ------- + (left, right) : (DataFrame, type of other) + Aligned objects + """ + if isinstance(other, DataFrame): + return self._align_frame(other, join=join, axis=axis, level=level, + copy=copy, fill_value=fill_value, + method=method, limit=limit, + fill_axis=fill_axis) + elif isinstance(other, Series): + return self._align_series(other, join=join, axis=axis, level=level, + copy=copy, fill_value=fill_value, + method=method, limit=limit, + fill_axis=fill_axis) + else: # pragma: no cover + raise TypeError('unsupported type: %s' % type(other)) + + def _align_frame(self, other, join='outer', axis=None, level=None, + copy=True, fill_value=np.nan, method=None, limit=None, + fill_axis=0): + # defaults + join_index, join_columns = None, None + ilidx, iridx = None, None + clidx, cridx = None, None + + if axis is None or axis == 0: + if not self.index.equals(other.index): + join_index, ilidx, iridx = \ + self.index.join(other.index, how=join, level=level, + return_indexers=True) + + if axis is None or axis == 1: + if not self.columns.equals(other.columns): + join_columns, clidx, cridx = \ + self.columns.join(other.columns, how=join, level=level, + return_indexers=True) + + left = self._reindex_with_indexers(join_index, ilidx, + join_columns, clidx, copy, + fill_value=fill_value) + right = other._reindex_with_indexers(join_index, iridx, + join_columns, cridx, copy, + fill_value=fill_value) + + if method is not None: + left = left.fillna(axis=fill_axis, method=method, limit=limit) + right = right.fillna(axis=fill_axis, method=method, limit=limit) + + return left, right + + def _align_series(self, other, join='outer', axis=None, level=None, + copy=True, fill_value=None, method=None, limit=None, + fill_axis=0): + fdata = self._data + if axis == 0: + join_index = self.index + lidx, ridx = None, None + if not self.index.equals(other.index): + join_index, lidx, ridx = self.index.join(other.index, how=join, + return_indexers=True) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=1) + elif axis == 1: + join_index = self.columns + lidx, ridx = None, None + if not self.columns.equals(other.index): + join_index, lidx, ridx = \ + self.columns.join(other.index, how=join, + return_indexers=True) + + if lidx is not None: + fdata = fdata.reindex_indexer(join_index, lidx, axis=0) + else: + raise ValueError('Must specify axis=0 or 1') + + if copy and fdata is self._data: + fdata = fdata.copy() + + left_result = DataFrame(fdata) + right_result = other if ridx is None else other.reindex(join_index) + + fill_na = notnull(fill_value) or (method is not None) + if fill_na: + return (left_result.fillna(fill_value, method=method, limit=limit, + axis=fill_axis), + right_result.fillna(fill_value, method=method, limit=limit)) + else: + return left_result, right_result + + def reindex(self, index=None, columns=None, method=None, level=None, + fill_value=np.nan, limit=None, copy=True): + """Conform DataFrame to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + index : array-like, optional + New labels / index to conform to. 
Preferably an Index object to + avoid duplicating data + columns : array-like, optional + Same usage as index argument + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed DataFrame + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value + limit : int, default None + Maximum size gap to forward or backward fill + + Examples + -------- + >>> df.reindex(index=[date1, date2, date3], columns=['A', 'B', 'C']) + + Returns + ------- + reindexed : same type as calling instance + """ + self._consolidate_inplace() + frame = self + + if (index is not None and columns is not None + and method is None and level is None + and not self._is_mixed_type): + return self._reindex_multi(index, columns, copy, fill_value) + + if columns is not None: + frame = frame._reindex_columns(columns, copy, level, + fill_value, limit) + + if index is not None: + frame = frame._reindex_index(index, method, copy, level, + fill_value, limit) + + return frame + + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, + limit=None, fill_value=np.nan): + """Conform DataFrame to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + index : array-like, optional + New labels / index to conform to. 
Preferably an Index object to + avoid duplicating data + axis : {0, 1} + 0 -> index (rows) + 1 -> columns + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed DataFrame + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + limit : int, default None + Maximum size gap to forward or backward fill + + Examples + -------- + >>> df.reindex_axis(['A', 'B', 'C'], axis=1) + + See also + -------- + DataFrame.reindex, DataFrame.reindex_like + + Returns + ------- + reindexed : same type as calling instance + """ + self._consolidate_inplace() + if axis == 0: + return self._reindex_index(labels, method, copy, level, + fill_value=fill_value, + limit=limit) + elif axis == 1: + return self._reindex_columns(labels, copy, level, + fill_value=fill_value, + limit=limit) + else: # pragma: no cover + raise ValueError('Must specify axis=0 or 1') + + def _reindex_multi(self, new_index, new_columns, copy, fill_value): + new_index, row_indexer = self.index.reindex(new_index) + new_columns, col_indexer = self.columns.reindex(new_columns) + + if row_indexer is not None and col_indexer is not None: + new_values = com.take_2d_multi(self.values, row_indexer, + col_indexer, fill_value=fill_value) + return DataFrame(new_values, index=new_index, columns=new_columns) + elif row_indexer is not None: + return self._reindex_with_indexers(new_index, row_indexer, + None, None, copy, fill_value) + elif col_indexer is not None: + return self._reindex_with_indexers(None, None, + new_columns, col_indexer, + copy, fill_value) + else: + return self.copy() if copy else self + + def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, + limit=None): + new_index, indexer = self.index.reindex(new_index, method, level, + limit=limit) + return self._reindex_with_indexers(new_index, indexer, None, None, + copy, fill_value) + + def _reindex_columns(self, new_columns, copy, level, fill_value=np.nan, + limit=None): + new_columns, indexer = self.columns.reindex(new_columns, level=level, + limit=limit) + return self._reindex_with_indexers(None, None, new_columns, indexer, + copy, fill_value) + + def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer, + copy, fill_value): + new_data = self._data + if row_indexer is not None: + row_indexer = com._ensure_int64(row_indexer) + new_data = new_data.reindex_indexer(index, row_indexer, axis=1, + fill_value=fill_value) + elif index is not None and index is not new_data.axes[1]: + new_data = new_data.copy(deep=copy) + new_data.axes[1] = index + + if col_indexer is not None: + # TODO: speed up on homogeneous DataFrame objects + col_indexer = com._ensure_int64(col_indexer) + new_data = new_data.reindex_indexer(columns, col_indexer, axis=0, + fill_value=fill_value) + elif columns is not None and columns is not new_data.axes[0]: + new_data = new_data.reindex_items(columns, copy=copy, + fill_value=fill_value) + + if copy and new_data is self._data: + new_data = new_data.copy() + + return DataFrame(new_data) + + def reindex_like(self, other, method=None, copy=True, limit=None): + """ + Reindex DataFrame to match indices of another DataFrame, optionally + with filling logic + + Parameters + ---------- + other : DataFrame + method : string or 
None + copy : boolean, default True + limit : int, default None + Maximum size gap to forward or backward fill + + Notes + ----- + Like calling s.reindex(index=other.index, columns=other.columns, + method=...) + + Returns + ------- + reindexed : DataFrame + """ + return self.reindex(index=other.index, columns=other.columns, + method=method, copy=copy, limit=limit) + + truncate = generic.truncate + + def set_index(self, keys, drop=True, inplace=False, + verify_integrity=False): + """ + Set the DataFrame index (row labels) using one or more existing + columns. By default yields a new object. + + Parameters + ---------- + keys : column label or list of column labels / arrays + drop : boolean, default True + Delete columns to be used as the new index + inplace : boolean, default False + Modify the DataFrame in place (do not create a new object) + verify_integrity : boolean, default False + Check the new index for duplicates. Otherwise defer the check until + necessary. Setting to False will improve the performance of this + method + + Examples + -------- + indexed_df = df.set_index(['A', 'B']) + indexed_df2 = df.set_index(['A', [0, 1, 2, 0, 1, 2]]) + indexed_df3 = df.set_index([[0, 1, 2, 0, 1, 2]]) + + Returns + ------- + dataframe : DataFrame + """ + if not isinstance(keys, (list, tuple)): + keys = [keys] + + if inplace: + frame = self + + else: + frame = self.copy() + + arrays = [] + for col in keys: + if isinstance(col, (list, Series, np.ndarray)): + level = col + else: + level = frame[col] + if drop: + del frame[col] + arrays.append(level) + + index = MultiIndex.from_arrays(arrays, names=keys) + + if verify_integrity and not index.is_unique: + duplicates = index.get_duplicates() + raise Exception('Index has duplicate keys: %s' % duplicates) + + # clear up memory usage + index._cleanup() + + frame.index = index + return frame + + def reset_index(self, drop=False): + """ + For DataFrame with multi-level index, return new DataFrame with + labeling information in the columns under the index names, defaulting + to 'level_0', 'level_1', etc. if any are None. For a standard index, + the index name will be used (if set), otherwise a default 'index' or + 'level_0' (if 'index' is already taken) will be used. 
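+
+ For example (a sketch with a plain, unnamed index):
+
+ >>> df = DataFrame({'v': [1, 2, 3]}, index=['a', 'b', 'c'])
+ >>> df.reset_index()  # old labels move into an 'index' column; rows get 0..2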
+ + Parameters + ---------- + drop : boolean, default False + Do not try to insert index into dataframe columns + + Returns + ------- + resetted : DataFrame + """ + new_obj = self.copy() + + def _maybe_cast(values): + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values) + return values + + if not drop: + if isinstance(self.index, MultiIndex): + names = self.index.names + zipped = zip(self.index.levels, self.index.labels) + for i, (lev, lab) in reversed(list(enumerate(zipped))): + col_name = names[i] + if col_name is None: + col_name = 'level_%d' % i + + # to ndarray and maybe infer different dtype + level_values = _maybe_cast(lev.values) + new_obj.insert(0, col_name, level_values.take(lab)) + else: + name = self.index.name + if name is None or name == 'index': + name = 'index' if 'index' not in self else 'level_0' + new_obj.insert(0, name, _maybe_cast(self.index.values)) + new_obj.index = np.arange(len(new_obj)) + return new_obj + + delevel = deprecate('delevel', reset_index) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take, return DataFrame corresponding to requested + indices along an axis + + Parameters + ---------- + indices : list / array of ints + axis : {0, 1} + + Returns + ------- + taken : DataFrame + """ + if isinstance(indices, list): + indices = np.array(indices) + if self._data.is_mixed_dtype(): + if axis == 0: + new_data = self._data.take(indices, axis=1) + return DataFrame(new_data) + else: + new_columns = self.columns.take(indices) + return self.reindex(columns=new_columns) + else: + new_values = com.take_2d(self.values, + com._ensure_int64(indices), + axis=axis) + if axis == 0: + new_columns = self.columns + new_index = self.index.take(indices) + else: + new_columns = self.columns.take(indices) + new_index = self.index + return DataFrame(new_values, index=new_index, + columns=new_columns) + + #---------------------------------------------------------------------- + # Reindex-based selection methods + + def filter(self, items=None, like=None, regex=None): + """ + Restrict frame's columns to set of items or wildcard + + Parameters + ---------- + items : list-like + List of columns to restrict to (must not all be present) + like : string + Keep columns where "arg in col == True" + regex : string (regular expression) + Keep columns with re.search(regex, col) == True + + Notes + ----- + Arguments are mutually exclusive, but this is not checked for + + Returns + ------- + DataFrame with filtered columns + """ + import re + if items is not None: + return self.reindex(columns=[r for r in items if r in self]) + elif like: + return self.select(lambda x: like in x, axis=1) + elif regex: + matcher = re.compile(regex) + return self.select(lambda x: matcher.match(x) is not None, axis=1) + else: + raise ValueError('items was None!') + + def dropna(self, axis=0, how='any', thresh=None, subset=None): + """ + Return object with labels on given axis omitted where alternately any + or all of the data are missing + + Parameters + ---------- + axis : {0, 1} + how : {'any', 'all'} + any : if any NA values are present, drop that label + all : if all values are NA, drop that label + thresh : int, default None + int value : require that many non-NA values + subset : array-like + Labels along other axis to consider, e.g. 
if you are dropping rows + these would be a list of columns to include + + Returns + ------- + dropped : DataFrame + """ + axis_name = self._get_axis_name(axis) + + if axis == 0: + agg_axis = 1 + elif axis == 1: + agg_axis = 0 + else: # pragma: no cover + raise ValueError('axis must be 0 or 1') + + agg_obj = self + if subset is not None: + agg_axis_name = self._get_axis_name(agg_axis) + agg_obj = self.reindex(**{agg_axis_name: subset}) + + count = agg_obj.count(axis=agg_axis) + + if thresh is not None: + mask = count >= thresh + elif how == 'any': + mask = count == len(agg_obj._get_axis(agg_axis)) + elif how == 'all': + mask = count > 0 + else: + if how is not None: + raise ValueError('do not recognize %s' % how) + else: + raise ValueError('must specify how or thresh') + + labels = self._get_axis(axis) + new_labels = labels[mask] + return self.reindex(**{axis_name: new_labels}) + + def drop_duplicates(self, cols=None, take_last=False, inplace=False): + """ + Return DataFrame with duplicate rows removed, optionally only + considering certain columns + + Parameters + ---------- + cols : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + take_last : boolean, default False + Take the last observed row in a row. Defaults to the first row + skipna : boolean, default True + If True then keep NaN + inplace : boolean, default False + Whether to drop duplicates in place or to return a copy + + Returns + ------- + deduplicated : DataFrame + """ + + duplicated = self.duplicated(cols, take_last=take_last) + + if inplace: + inds, = (-duplicated).nonzero() + self._data = self._data.take(inds) + self._clear_item_cache() + return self + else: + return self[-duplicated] + + def duplicated(self, cols=None, take_last=False): + """ + Return boolean Series denoting duplicate rows, optionally only + considering certain columns + + Parameters + ---------- + cols : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns + take_last : boolean, default False + Take the last observed row in a row. Defaults to the first row + + Returns + ------- + duplicated : Series + """ + if cols is not None: + if isinstance(cols, list): + values = [self[x].values for x in cols] + keys = lib.fast_zip_fillna(values) + else: + keys = lib.fast_zip_fillna([self[cols]]) + else: + values = list(self.values.T) + keys = lib.fast_zip_fillna(values) + + duplicated = lib.duplicated(keys, take_last=take_last) + return Series(duplicated, index=self.index) + + #---------------------------------------------------------------------- + # Sorting + + def sort(self, columns=None, column=None, axis=0, ascending=True, + inplace=False): + """ + Sort DataFrame either by labels (along either axis) or by the values in + column(s) + + Parameters + ---------- + columns : object + Column name(s) in frame. Accepts a column name or a list or tuple + for a nested sort. + ascending : boolean, default True + Sort ascending vs. 
descending + axis : {0, 1} + Sort index/rows versus columns + inplace : boolean, default False + Sort the DataFrame without creating a new instance + + Returns + ------- + sorted : DataFrame + """ + if column is not None: # pragma: no cover + import warnings + warnings.warn("column is deprecated, use columns", FutureWarning) + columns = column + return self.sort_index(by=columns, axis=axis, ascending=ascending, + inplace=inplace) + + def sort_index(self, axis=0, by=None, ascending=True, inplace=False): + """ + Sort DataFrame either by labels (along either axis) or by the values in + a column + + Parameters + ---------- + axis : {0, 1} + Sort index/rows versus columns + by : object + Column name(s) in frame. Accepts a column name or a list or tuple + for a nested sort. + ascending : boolean, default True + Sort ascending vs. descending + inplace : boolean, default False + Sort the DataFrame without creating a new instance + + Returns + ------- + sorted : DataFrame + """ + from pandas.core.groupby import _lexsort_indexer + + if axis not in [0, 1]: + raise ValueError('Axis must be 0 or 1, got %s' % str(axis)) + + labels = self._get_axis(axis) + + if by is not None: + assert(axis == 0) + if isinstance(by, (tuple, list)): + keys = [self[x].values for x in by] + indexer = _lexsort_indexer(keys) + else: + indexer = self[by].values.argsort() + else: + indexer = labels.argsort() + + if not ascending: + indexer = indexer[::-1] + + if inplace: + if axis == 1: + self._data = self._data.reindex_items(self._data.items[indexer], + copy=False) + elif axis == 0: + self._data = self._data.take(indexer) + + self._clear_item_cache() + return self + else: + return self.take(indexer, axis=axis) + + def sortlevel(self, level=0, axis=0, ascending=True): + """ + Sort multilevel index by chosen axis and primary level. Data will be + lexicographically sorted by the chosen level followed by the other + levels (in order) + + Parameters + ---------- + level : int + axis : {0, 1} + ascending : bool, default True + + Returns + ------- + sorted : DataFrame + """ + the_axis = self._get_axis(axis) + if not isinstance(the_axis, MultiIndex): + raise Exception('can only sort by level with a hierarchical index') + + new_axis, indexer = the_axis.sortlevel(level, ascending=ascending) + + if self._data.is_mixed_dtype(): + if axis == 0: + return self.reindex(index=new_axis) + else: + return self.reindex(columns=new_axis) + + if axis == 0: + index = new_axis + columns = self.columns + else: + index = self.index + columns = new_axis + new_values = self.values.take(indexer, axis=axis) + return self._constructor(new_values, index=index, columns=columns) + + def swaplevel(self, i, j, axis=0): + """ + Swap levels i and j in a MultiIndex on a particular axis + + Returns + ------- + swapped : type of caller (new object) + """ + result = self.copy() + + if axis == 0: + result.index = result.index.swaplevel(i, j) + else: + result.columns = result.columns.swaplevel(i, j) + return result + + def reorder_levels(self, order, axis=0): + """ + Rearrange index levels using input order. + May not drop or duplicate levels + + Parameters + ---------- + order: list of int representing new level order. 
+ (reference level by number not by key) + axis: where to reorder levels + + Returns + ------- + type of caller (new object) + """ + if not isinstance(self._get_axis(axis), + MultiIndex): # pragma: no cover + raise Exception('Can only reorder levels on a hierarchical axis.') + + result = self.copy() + + if axis == 0: + result.index = result.index.reorder_levels(order) + else: + result.columns = result.columns.reorder_levels(order) + return result + + #---------------------------------------------------------------------- + # Filling NA's + + def fillna(self, value=None, method='pad', axis=0, inplace=False, + limit=None): + """ + Fill NA/NaN values using the specified method + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + value : scalar or dict + Value to use to fill holes (e.g. 0), alternately a dict of values + specifying which value to use for each column (columns not in the + dict will not be filled) + axis : {0, 1}, default 0 + 0: fill column-by-column + 1: fill row-by-row + inplace : boolean, default False + If True, fill the DataFrame in place. Note: this will modify any + other views on this DataFrame, like if you took a no-copy slice of + an existing DataFrame, for example a column in a DataFrame. Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + See also + -------- + reindex, asfreq + + Returns + ------- + filled : DataFrame + """ + self._consolidate_inplace() + + if value is None: + if self._is_mixed_type and axis == 1: + return self.T.fillna(method=method, limit=limit).T + + new_blocks = [] + method = com._clean_fill_method(method) + for block in self._data.blocks: + if block._can_hold_na: + newb = block.interpolate(method, axis=axis, + limit=limit, inplace=inplace) + else: + newb = block if inplace else block.copy() + new_blocks.append(newb) + + new_data = BlockManager(new_blocks, self._data.axes) + else: + # Float type values + if len(self.columns) == 0: + return self + if isinstance(value, (dict, Series)): + if axis == 1: + raise NotImplementedError('Currently only can fill ' + 'with dict/Series column ' + 'by column') + + result = self if inplace else self.copy() + for k, v in value.iteritems(): + if k not in result: + continue + result[k].fillna(v, inplace=True) + return result + else: + new_data = self._data.fillna(value, inplace=inplace) + + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + def replace(self, to_replace, value=None, method='pad', axis=0, + inplace=False, limit=None): + """ + Replace values given in 'to_replace' with 'value' or using 'method' + + Parameters + ---------- + value : scalar or dict, default None + Value to use to fill holes (e.g. 0), alternately a dict of values + specifying which value to use for each column (columns not in the + dict will not be filled) + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + axis : {0, 1}, default 0 + 0: fill column-by-column + 1: fill row-by-row + inplace : boolean, default False + If True, fill the DataFrame in place. 
Note: this will modify any + other views on this DataFrame, like if you took a no-copy slice of + an existing DataFrame, for example a column in a DataFrame. Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + See also + -------- + reindex, asfreq + + Returns + ------- + filled : DataFrame + """ + self._consolidate_inplace() + + if value is None: + return self._interpolate(to_replace, method, axis, inplace, limit) + else: + if len(self.columns) == 0: + return self + + if isinstance(to_replace, dict): + if isinstance(value, dict): # {'A' : np.nan} -> {'A' : 0} + return self._replace_both_dict(to_replace, value, inplace) + + elif not isinstance(value, (list, np.ndarray)): + return self._replace_src_dict(to_replace, value, inplace) + + raise ValueError('Fill value must be scalar or dict') + + elif isinstance(to_replace, (list, np.ndarray)): + # [np.nan, ''] -> [0, 'missing'] + if isinstance(value, (list, np.ndarray)): + if len(to_replace) != len(value): + raise ValueError('Replacement lists must match ' + 'in length. Expecting %d got %d ' % + (len(to_replace), len(value))) + + new_data = self._data if inplace else self.copy()._data + new_data._replace_list(to_replace, value) + + else: # [np.nan, ''] -> 0 + new_data = self._data.replace(to_replace, value, + inplace=inplace) + + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + else: + if isinstance(value, dict): # np.nan -> {'A' : 0, 'B' : -1} + return self._replace_dest_dict(to_replace, value, inplace) + elif not isinstance(value, (list, np.ndarray)): # np.nan -> 0 + new_data = self._data.replace(to_replace, value, + inplace=inplace) + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + raise ValueError('Invalid to_replace type: %s' % + type(to_replace)) # pragma: no cover + + def _interpolate(self, to_replace, method, axis, inplace, limit): + if self._is_mixed_type and axis == 1: + return self.T.replace(to_replace, method=method, limit=limit).T + + method = com._clean_fill_method(method) + + if isinstance(to_replace, dict): + if axis == 1: + return self.T.replace(to_replace, method=method, + limit=limit).T + + rs = self if inplace else self.copy() + for k, v in to_replace.iteritems(): + if k in rs: + rs[k].replace(v, method=method, limit=limit, + inplace=True) + return rs + + else: + new_blocks = [] + for block in self._data.blocks: + newb = block.interpolate(method, axis=axis, + limit=limit, inplace=inplace, + missing=to_replace) + new_blocks.append(newb) + new_data = BlockManager(new_blocks, self._data.axes) + + if inplace: + self._data = new_data + return self + else: + return self._constructor(new_data) + + def _replace_dest_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for k, v in value.iteritems(): + if k in rs: + rs[k].replace(to_replace, v, inplace=True) + return rs + + def _replace_src_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for k, src in to_replace.iteritems(): + if k in rs: + rs[k].replace(src, value, inplace=True) + return rs + + def _replace_both_dict(self, to_replace, value, inplace): + rs = self if inplace else self.copy() + for c, src in to_replace.iteritems(): + if c in value and c in rs: + rs[c].replace(src, value[c], inplace=True) + return rs + + #---------------------------------------------------------------------- + # Rename + + def rename(self, 
index=None, columns=None, copy=True, inplace=False): + """ + Alter index and / or columns using input function or + functions. Function / dict values must be unique (1-to-1). Labels not + contained in a dict / Series will be left as-is. + + Parameters + ---------- + index : dict-like or function, optional + Transformation to apply to index values + columns : dict-like or function, optional + Transformation to apply to column values + copy : boolean, default True + Also copy underlying data + inplace : boolean, default False + Whether to return a new DataFrame. If True then value of copy is + ignored. + + See also + -------- + Series.rename + + Returns + ------- + renamed : DataFrame (new object) + """ + from pandas.core.series import _get_rename_function + + if index is None and columns is None: + raise Exception('must pass either index or columns') + + index_f = _get_rename_function(index) + columns_f = _get_rename_function(columns) + + self._consolidate_inplace() + + result = self if inplace else self.copy(deep=copy) + + if index is not None: + result._rename_index_inplace(index_f) + + if columns is not None: + result._rename_columns_inplace(columns_f) + + return result + + def _rename_index_inplace(self, mapper): + self._data = self._data.rename_axis(mapper, axis=1) + self._clear_item_cache() + + def _rename_columns_inplace(self, mapper): + self._data = self._data.rename_items(mapper, copydata=False) + self._clear_item_cache() + + #---------------------------------------------------------------------- + # Arithmetic / combination related + + def _combine_frame(self, other, func, fill_value=None, level=None): + this, other = self.align(other, join='outer', level=level, copy=False) + new_index, new_columns = this.index, this.columns + + this_vals = this.values + other_vals = other.values + + if fill_value is not None: + this_mask = isnull(this_vals) + other_mask = isnull(other_vals) + this_vals = this_vals.copy() + other_vals = other_vals.copy() + + # one but not both + mask = this_mask ^ other_mask + this_vals[this_mask & mask] = fill_value + other_vals[other_mask & mask] = fill_value + + result = func(this_vals, other_vals) + return self._constructor(result, index=new_index, columns=new_columns, + copy=False) + + def _indexed_same(self, other): + same_index = self.index.equals(other.index) + same_columns = self.columns.equals(other.columns) + return same_index and same_columns + + def _combine_series(self, other, func, fill_value=None, axis=None, + level=None): + if axis is not None: + axis = self._get_axis_name(axis) + if axis == 'index': + return self._combine_match_index(other, func, fill_value) + else: + return self._combine_match_columns(other, func, fill_value) + return self._combine_series_infer(other, func, fill_value) + + def _combine_series_infer(self, other, func, fill_value=None): + if len(other) == 0: + return self * nan + + if len(self) == 0: + # Ambiguous case, use _series so works with DataFrame + return self._constructor(data=self._series, index=self.index, + columns=self.columns) + + # teeny hack because one does DataFrame + TimeSeries all the time + if self.index.is_all_dates and other.index.is_all_dates: + return self._combine_match_index(other, func, fill_value) + else: + return self._combine_match_columns(other, func, fill_value) + + def _combine_match_index(self, other, func, fill_value=None): + left, right = self.align(other, join='outer', axis=0, copy=False) + if fill_value is not None: + raise NotImplementedError + return self._constructor(func(left.values.T, 
right.values).T, + index=left.index, + columns=self.columns, copy=False) + + def _combine_match_columns(self, other, func, fill_value=None): + left, right = self.align(other, join='outer', axis=1, copy=False) + if fill_value is not None: + raise NotImplementedError + + return self._constructor(func(left.values, right.values), + index=self.index, + columns=left.columns, copy=False) + + def _combine_const(self, other, func): + if self.empty: + return self + + result_values = func(self.values, other) + + if not isinstance(result_values, np.ndarray): + raise TypeError('Could not compare %s with DataFrame values' + % repr(other)) + + return self._constructor(result_values, index=self.index, + columns=self.columns, copy=False) + + def _compare_frame(self, other, func): + if not self._indexed_same(other): + raise Exception('Can only compare identically-labeled ' + 'DataFrame objects') + + new_data = {} + for col in self.columns: + new_data[col] = func(self[col], other[col]) + + return self._constructor(data=new_data, index=self.index, + columns=self.columns, copy=False) + + def _flex_compare_frame(self, other, func, level): + if not self._indexed_same(other): + self, other = self.align(other, 'outer', level=level) + + new_data = {} + for col in self.columns: + new_data[col] = func(self[col], other[col]) + + return self._constructor(data=new_data, index=self.index, + columns=self.columns, copy=False) + + def combine(self, other, func, fill_value=None): + """ + Add two DataFrame objects and do not propagate NaN values, so if for a + (column, time) one frame is missing a value, it will default to the + other frame's value (which might be NaN as well) + + Parameters + ---------- + other : DataFrame + func : function + fill_value : scalar value + + Returns + ------- + result : DataFrame + """ + if other.empty: + return self.copy() + + if self.empty: + return other.copy() + + this, other = self.align(other, copy=False) + new_index = this.index + + # sorts if possible + new_columns = this.columns.union(other.columns) + do_fill = fill_value is not None + + result = {} + for col in new_columns: + series = this[col].values + otherSeries = other[col].values + + if do_fill: + this_mask = isnull(series) + other_mask = isnull(otherSeries) + series = series.copy() + otherSeries = otherSeries.copy() + series[this_mask] = fill_value + otherSeries[other_mask] = fill_value + + arr = func(series, otherSeries) + + if do_fill: + arr = com.ensure_float(arr) + arr[this_mask & other_mask] = nan + + result[col] = arr + + return self._constructor(result, index=new_index, columns=new_columns) + + def combine_first(self, other): + """ + Combine two DataFrame objects and default to non-null values in frame + calling the method. Result index will be the union of the two indexes + + Parameters + ---------- + other : DataFrame + + Examples + -------- + >>> a.combine_first(b) + a's values prioritized, use values from b to fill holes + + Returns + ------- + combined : DataFrame + """ + combiner = lambda x, y: np.where(isnull(x), y, x) + return self.combine(other, combiner) + + def update(self, other, join='left', overwrite=True, filter_func=None, + raise_conflict=False): + """ + Modify DataFrame in place using non-NA values from passed + DataFrame. 
Aligns on indices + + Parameters + ---------- + other : DataFrame + join : {'left', 'right', 'outer', 'inner'}, default 'left' + overwrite : boolean, default True + If True then overwrite values for common keys in the calling frame + filter_func : callable(1d-array) -> 1d-array, default None + Can choose to replace values other than NA. Return True for values + that should be updated + raise_conflict : bool + If True, will raise an error if the DataFrame and other both + contain data in the same place. + """ + if join != 'left': + raise NotImplementedError + + other = other.reindex_like(self) + for col in self.columns: + this = self[col].values + that = other[col].values + if filter_func is not None: + mask = -filter_func(this) | isnull(that) + else: + if raise_conflict: + mask_this = notnull(that) + mask_that = notnull(this) + if any(mask_this & mask_that): + raise DataConflictError("Data overlaps.") + + if overwrite: + mask = isnull(that) + else: + mask = notnull(this) + self[col] = np.where(mask, this, that) + + #---------------------------------------------------------------------- + # Misc methods + + def first_valid_index(self): + """ + Return label for first non-NA/null value + """ + return self.index[self.count(1) > 0][0] + + def last_valid_index(self): + """ + Return label for last non-NA/null value + """ + return self.index[self.count(1) > 0][-1] + + def head(self, n=5): + """Returns first n rows of DataFrame + """ + return self[:n] + + def tail(self, n=5): + """Returns last n rows of DataFrame + """ + return self[-n:] + + #---------------------------------------------------------------------- + # Data reshaping + + def pivot(self, index=None, columns=None, values=None): + """ + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from index / columns to form axes and return either + DataFrame or Panel, depending on whether you request a single value + column (DataFrame) or all columns (Panel) + + Parameters + ---------- + index : string or object + Column name to use to make new frame's index + columns : string or object + Column name to use to make new frame's columns + values : string or object, optional + Column name to use for populating new frame's values + + Notes + ----- + For finer-tuned control, see hierarchical indexing documentation along + with the related stack/unstack methods + + Examples + -------- + >>> df + foo bar baz + 0 one A 1. + 1 one B 2. + 2 one C 3. + 3 two A 4. + 4 two B 5. + 5 two C 6. + + >>> df.pivot('foo', 'bar', 'baz') + A B C + one 1 2 3 + two 4 5 6 + + >>> df.pivot('foo', 'bar')['baz'] + A B C + one 1 2 3 + two 4 5 6 + + Returns + ------- + pivoted : DataFrame + If no values column specified, will have hierarchically indexed + columns + """ + from pandas.core.reshape import pivot + return pivot(self, index=index, columns=columns, values=values) + + def stack(self, level=-1, dropna=True): + """ + Pivot a level of the (possibly hierarchical) column labels, returning a + DataFrame (or Series in the case of an object with a single level of + column labels) having a hierarchical index with a new inner-most level + of row labels. + + Parameters + ---------- + level : int, string, or list of these, default last level + Level(s) to stack, can pass level name + dropna : boolean, default True + Whether to drop rows in the resulting Frame/Series with no valid + values + + Examples + ---------- + >>> s + a b + one 1. 2. + two 3. 4. 
+ + >>> s.stack() + one a 1 + b 2 + two a 3 + b 4 + + Returns + ------- + stacked : DataFrame or Series + """ + from pandas.core.reshape import stack + + if isinstance(level, (tuple, list)): + result = self + for lev in level: + result = stack(result, lev, dropna=dropna) + return result + else: + return stack(self, level, dropna=dropna) + + def unstack(self, level=-1): + """ + Pivot a level of the (necessarily hierarchical) index labels, returning + a DataFrame having a new level of column labels whose inner-most level + consists of the pivoted index labels. If the index is not a MultiIndex, + the output will be a Series (the analogue of stack when the columns are + not a MultiIndex) + + Parameters + ---------- + level : int, string, or list of these, default last level + Level(s) of index to unstack, can pass level name + + Examples + -------- + >>> s + one a 1. + one b 2. + two a 3. + two b 4. + + >>> s.unstack(level=-1) + a b + one 1. 2. + two 3. 4. + + >>> df = s.unstack(level=0) + >>> df + one two + a 1. 2. + b 3. 4. + + >>> df.unstack() + one a 1. + b 3. + two a 2. + b 4. + + Returns + ------- + unstacked : DataFrame or Series + """ + from pandas.core.reshape import unstack + return unstack(self, level) + + #---------------------------------------------------------------------- + # Time series-related + + def diff(self, periods=1): + """ + 1st discrete difference of object + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming difference + + Returns + ------- + diffed : DataFrame + """ + return self - self.shift(periods) + + def shift(self, periods=1, freq=None, **kwds): + """ + Shift the index of the DataFrame by desired number of periods with an + optional time freq + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + freq : DateOffset, timedelta, or time rule string, optional + Increment to use from datetools module or time rule (e.g. 'EOM') + + Notes + ----- + If freq is specified then the index values are shifted but the data + if not realigned + + Returns + ------- + shifted : DataFrame + """ + from pandas.core.series import _resolve_offset + + if periods == 0: + return self + + offset = _resolve_offset(freq, kwds) + + if isinstance(offset, basestring): + offset = datetools.to_offset(offset) + + def _shift_block(blk, indexer): + new_values = blk.values.take(indexer, axis=1) + # convert integer to float if necessary. 
need to do a lot more than + # that, handle boolean etc also + new_values = com.ensure_float(new_values) + if periods > 0: + new_values[:, :periods] = nan + else: + new_values[:, periods:] = nan + return make_block(new_values, blk.items, blk.ref_items) + + if offset is None: + indexer = self._shift_indexer(periods) + new_blocks = [_shift_block(b, indexer) for b in self._data.blocks] + new_data = BlockManager(new_blocks, [self.columns, self.index]) + elif isinstance(self.index, PeriodIndex): + orig_offset = datetools.to_offset(self.index.freq) + if offset == orig_offset: + new_data = self._data.copy() + new_data.axes[1] = self.index.shift(periods) + else: + msg = ('Given freq %s does not match PeriodIndex freq %s' % + (offset.rule_code, orig_offset.rule_code)) + raise ValueError(msg) + else: + new_data = self._data.copy() + new_data.axes[1] = self.index.shift(periods, offset) + + return self._constructor(new_data) + + def _shift_indexer(self, periods): + # small reusable utility + N = len(self) + indexer = np.zeros(N, dtype=int) + + if periods > 0: + indexer[periods:] = np.arange(N - periods) + else: + indexer[:periods] = np.arange(-periods, N) + + return indexer + + #---------------------------------------------------------------------- + # Function application + + def apply(self, func, axis=0, broadcast=False, raw=False, + args=(), **kwds): + """ + Applies function along input axis of DataFrame. Objects passed to + functions are Series objects having index either the DataFrame's index + (axis=0) or the columns (axis=1). Return type depends on whether passed + function aggregates + + Parameters + ---------- + func : function + Function to apply to each column + axis : {0, 1} + 0 : apply function to each column + 1 : apply function to each row + broadcast : bool, default False + For aggregation functions, return object of same size with values + propagated + raw : boolean, default False + If False, convert each row or column into a Series. If raw=True the + passed function will receive ndarray objects instead. 
If you are + just applying a NumPy reduction function this will achieve much + better performance + args : tuple + Positional arguments to pass to function in addition to the + array/series + Additional keyword arguments will be passed as keywords to the function + + Examples + -------- + >>> df.apply(numpy.sqrt) # returns DataFrame + >>> df.apply(numpy.sum, axis=0) # equiv to df.sum(0) + >>> df.apply(numpy.sum, axis=1) # equiv to df.sum(1) + + Notes + ----- + To apply a function elementwise, use applymap + + Returns + ------- + applied : Series or DataFrame + """ + if len(self.columns) == 0 and len(self.index) == 0: + return self + + if kwds or args and not isinstance(func, np.ufunc): + f = lambda x: func(x, *args, **kwds) + else: + f = func + + if isinstance(f, np.ufunc): + results = f(self.values) + return self._constructor(data=results, index=self.index, + columns=self.columns, copy=False) + else: + if not broadcast: + if not all(self.shape): + is_reduction = not isinstance(f(_EMPTY_SERIES), + np.ndarray) + if is_reduction: + return Series(np.nan, + index=self._get_agg_axis(axis)) + else: + return self.copy() + + if raw and not self._is_mixed_type: + return self._apply_raw(f, axis) + else: + return self._apply_standard(f, axis) + else: + return self._apply_broadcast(f, axis) + + def _apply_raw(self, func, axis): + try: + result = lib.reduce(self.values, func, axis=axis) + except Exception: + result = np.apply_along_axis(func, axis, self.values) + + # TODO: mixed type case + if result.ndim == 2: + return DataFrame(result, index=self.index, + columns=self.columns) + else: + return Series(result, index=self._get_agg_axis(axis)) + + def _apply_standard(self, func, axis, ignore_failures=False): + try: + + assert(not self._is_mixed_type) # maybe a hack for now + values = self.values + dummy = Series(np.nan, index=self._get_axis(axis), + dtype=values.dtype) + + labels = self._get_agg_axis(axis) + result = lib.reduce(values, func, axis=axis, dummy=dummy, + labels=labels) + return Series(result, index=self._get_agg_axis(axis)) + except Exception: + pass + + if axis == 0: + series_gen = ((c, self[c]) for c in self.columns) + res_index = self.columns + res_columns = self.index + elif axis == 1: + res_index = self.index + res_columns = self.columns + series_gen = ((i, Series(v, self.columns, name=i)) + for i, v in izip(self.index, self.values)) + + results = {} + if ignore_failures: + successes = [] + for i, (k, v) in enumerate(series_gen): + try: + results[k] = func(v) + successes.append(i) + except Exception: + pass + # so will work with MultiIndex, need test + if len(successes) < len(res_index): + res_index = res_index.take(successes) + else: + try: + for k, v in series_gen: + results[k] = func(v) + except Exception, e: + try: + if hasattr(e, 'args'): + e.args = e.args + ('occurred at index %s' % str(k),) + except NameError: # pragma: no cover + # no k defined yet + pass + raise + + if len(results) > 0 and _is_sequence(results.values()[0]): + if not isinstance(results.values()[0], Series): + index = res_columns + else: + index = None + + result = self._constructor(data=results, index=index, + columns=res_index) + + if axis == 1: + result = result.T + + return result.convert_objects() + else: + return Series(results, index=res_index) + + def _apply_broadcast(self, func, axis): + if axis == 0: + target = self + elif axis == 1: + target = self.T + + result_values = np.empty_like(target.values) + columns = target.columns + for i, col in enumerate(columns): + result_values[:, i] = func(target[col]) 
+ + result = self._constructor(result_values, index=target.index, + columns=target.columns) + + if axis == 1: + result = result.T + + return result + + def applymap(self, func): + """ + Apply a function to a DataFrame that is intended to operate + elementwise, i.e. like doing map(func, series) for each series in the + DataFrame + + Parameters + ---------- + func : function + Python function, returns a single value from a single value + + Returns + ------- + applied : DataFrame + """ + return self.apply(lambda x: lib.map_infer(x, func)) + + #---------------------------------------------------------------------- + # Merging / joining methods + + def append(self, other, ignore_index=False, verify_integrity=False): + """ + Append columns of other to end of this frame's columns and index, + returning a new object. Columns not in this frame are added as new + columns. + + Parameters + ---------- + other : DataFrame or list of Series/dict-like objects + ignore_index : boolean, default False + If True do not use the index labels. Useful for gluing together + record arrays + verify_integrity : boolean, default False + If True, raise Exception on creating index with duplicates + + Notes + ----- + If a list of dict is passed and the keys are all contained in the + DataFrame's index, the order of the columns in the resulting DataFrame + will be unchanged + + Returns + ------- + appended : DataFrame + """ + if isinstance(other, (Series, dict)): + if isinstance(other, dict): + other = Series(other) + if other.name is None and not ignore_index: + raise Exception('Can only append a Series if ' + 'ignore_index=True') + + index = None if other.name is None else [other.name] + other = other.reindex(self.columns, copy=False) + other = DataFrame(other.values.reshape((1, len(other))), + index=index, columns=self.columns) + elif isinstance(other, list) and not isinstance(other[0], DataFrame): + other = DataFrame(other) + if (self.columns.get_indexer(other.columns) >= 0).all(): + other = other.ix[:, self.columns] + + from pandas.tools.merge import concat + if isinstance(other, (list, tuple)): + to_concat = [self] + other + else: + to_concat = [self, other] + return concat(to_concat, ignore_index=ignore_index, + verify_integrity=verify_integrity) + + def join(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + """ + Join columns with other DataFrame either on index or on a key + column. Efficiently Join multiple DataFrame objects by index at once by + passing a list. + + Parameters + ---------- + other : DataFrame, Series with name field set, or list of DataFrame + Index should be similar to one of the columns in this one. If a + Series is passed, its name attribute must be set, and that will be + used as the column name in the resulting joined DataFrame + on : column name, tuple/list of column names, or array-like + Column(s) to use for joining, otherwise join on index. If multiples + columns given, the passed DataFrame must have a MultiIndex. Can + pass an array as the join key if not already contained in the + calling DataFrame. Like an Excel VLOOKUP operation + how : {'left', 'right', 'outer', 'inner'} + How to handle indexes of the two objects. 
Default: 'left' + for joining on index, None otherwise + * left: use calling frame's index + * right: use input frame's index + * outer: form union of indexes + * inner: use intersection of indexes + lsuffix : string + Suffix to use from left frame's overlapping columns + rsuffix : string + Suffix to use from right frame's overlapping columns + sort : boolean, default False + Order result DataFrame lexicographically by the join key. If False, + preserves the index order of the calling (left) DataFrame + + Notes + ----- + on, lsuffix, and rsuffix options are not supported when passing a list + of DataFrame objects + + Returns + ------- + joined : DataFrame + """ + # For SparseDataFrame's benefit + return self._join_compat(other, on=on, how=how, lsuffix=lsuffix, + rsuffix=rsuffix, sort=sort) + + def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + from pandas.tools.merge import merge, concat + + if isinstance(other, Series): + assert(other.name is not None) + other = DataFrame({other.name: other}) + + if isinstance(other, DataFrame): + return merge(self, other, left_on=on, how=how, + left_index=on is None, right_index=True, + suffixes=(lsuffix, rsuffix), sort=sort) + else: + if on is not None: + raise ValueError('Joining multiple DataFrames only supported' + ' for joining on index') + + # join indexes only using concat + if how == 'left': + how = 'outer' + join_axes = [self.index] + else: + join_axes = None + + frames = [self] + list(other) + + can_concat = all(df.index.is_unique for df in frames) + + if can_concat: + return concat(frames, axis=1, join=how, join_axes=join_axes, + verify_integrity=True) + + joined = frames[0] + + for frame in frames[1:]: + joined = merge(joined, frame, how=how, + left_index=True, right_index=True) + + return joined + + @Substitution('') + @Appender(_merge_doc, indents=2) + def merge(self, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True): + from pandas.tools.merge import merge + return merge(self, right, how=how, on=on, + left_on=left_on, right_on=right_on, + left_index=left_index, right_index=right_index, sort=sort, + suffixes=suffixes, copy=copy) + + #---------------------------------------------------------------------- + # Statistical methods, etc. 
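As a rough illustration of the join/merge methods documented above (the frames `left` and `right` and their contents below are hypothetical, not part of this source file):

    >>> left = DataFrame({'key': ['a', 'b', 'c'], 'lval': [1, 2, 3]})
    >>> right = DataFrame({'rval': [4, 5]}, index=['a', 'b'])
    >>> left.join(right, on='key')       # VLOOKUP-style join of right's index against left['key']
    >>> left.merge(right, left_on='key', right_index=True, how='left')   # similar key-vs-index merge

With the default how='left', keys in `left` that have no match in `right`'s index (here 'c') yield NaN in the joined columns.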
+ + def corr(self, method='pearson'): + """ + Compute pairwise correlation of columns, excluding NA/null values + + Parameters + ---------- + method : {'pearson', 'kendall', 'spearman'} + pearson : standard correlation coefficient + kendall : Kendall Tau correlation coefficient + spearman : Spearman rank correlation + + Returns + ------- + y : DataFrame + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + mat = numeric_df.values + + if method == 'pearson': + correl = lib.nancorr(mat) + else: + mat = mat.T + corrf = nanops.get_corr_func(method) + K = len(cols) + correl = np.empty((K, K), dtype=float) + mask = np.isfinite(mat) + for i, ac in enumerate(mat): + for j, bc in enumerate(mat): + valid = mask[i] & mask[j] + if not valid.all(): + c = corrf(ac[valid], bc[valid]) + else: + c = corrf(ac, bc) + correl[i, j] = c + correl[j, i] = c + + return self._constructor(correl, index=cols, columns=cols) + + def cov(self): + """ + Compute pairwise covariance of columns, excluding NA/null values + + Returns + ------- + y : DataFrame + """ + numeric_df = self._get_numeric_data() + cols = numeric_df.columns + mat = numeric_df.values + + if notnull(mat).all(): + baseCov = np.cov(mat.T) + else: + baseCov = lib.nancorr(mat, cov=True) + + return self._constructor(baseCov, index=cols, columns=cols) + + def corrwith(self, other, axis=0, drop=False): + """ + Compute pairwise correlation between rows or columns of two DataFrame + objects. + + Parameters + ---------- + other : DataFrame + axis : {0, 1} + 0 to compute column-wise, 1 for row-wise + drop : boolean, default False + Drop missing indices from result, default returns union of all + + Returns + ------- + correls : Series + """ + if isinstance(other, Series): + return self.apply(other.corr, axis=axis) + + this = self._get_numeric_data() + other = other._get_numeric_data() + + left, right = this.align(other, join='inner', copy=False) + + # mask missing values + left = left + right * 0 + right = right + left * 0 + + if axis == 1: + left = left.T + right = right.T + + # demeaned data + ldem = left - left.mean() + rdem = right - right.mean() + + num = (ldem * rdem).sum() + dom = (left.count() - 1) * left.std() * right.std() + + correl = num / dom + + if not drop: + raxis = 1 if axis == 0 else 0 + result_index = this._get_axis(raxis).union(other._get_axis(raxis)) + correl = correl.reindex(result_index) + + return correl + + def describe(self, percentile_width=50): + """ + Generate various summary statistics of each column, excluding + NaN values. These include: count, mean, std, min, max, and + lower%/50%/upper% percentiles + + Parameters + ---------- + percentile_width : float, optional + width of the desired uncertainty interval, default is 50, + which corresponds to lower=25, upper=75 + + Returns + ------- + DataFrame of summary statistics + """ + numdata = self._get_numeric_data() + + if len(numdata.columns) == 0: + return DataFrame(dict((k, v.describe()) + for k, v in self.iteritems()), + columns=self.columns) + + lb = .5 * (1. - percentile_width/100.) + ub = 1. 
- lb
+
+        def pretty_name(x):
+            x *= 100
+            if x == int(x):
+                return '%.0f%%' % x
+            else:
+                return '%.1f%%' % x
+
+        destat_columns = ['count', 'mean', 'std', 'min',
+                          pretty_name(lb), '50%', pretty_name(ub),
+                          'max']
+
+        destat = []
+
+        for column in numdata.columns:
+            series = self[column]
+            ser_desc = series.describe()
+            destat.append([series.count(), series.mean(), series.std(),
+                           series.min(), series.quantile(lb), series.median(),
+                           series.quantile(ub), series.max()])
+
+        return self._constructor(map(list, zip(*destat)), index=destat_columns,
+                                 columns=numdata.columns)
+
+    #----------------------------------------------------------------------
+    # ndarray-like stats methods
+
+    def count(self, axis=0, level=None, numeric_only=False):
+        """
+        Return Series with number of non-NA/null observations over requested
+        axis. Works with non-floating point data as well (detects NaN and None)
+
+        Parameters
+        ----------
+        axis : {0, 1}
+            0 for row-wise, 1 for column-wise
+        level : int, default None
+            If the axis is a MultiIndex (hierarchical), count along a
+            particular level, collapsing into a DataFrame
+        numeric_only : boolean, default False
+            Include only float, int, boolean data
+
+        Returns
+        -------
+        count : Series (or DataFrame if level specified)
+        """
+        if level is not None:
+            return self._count_level(level, axis=axis,
+                                     numeric_only=numeric_only)
+
+        if numeric_only:
+            frame = self._get_numeric_data()
+        else:
+            frame = self
+
+        # GH #423
+        if len(frame._get_axis(axis)) == 0:
+            result = Series(0, index=frame._get_agg_axis(axis))
+        else:
+            if axis == 1:
+                counts = notnull(frame.values).sum(1)
+                result = Series(counts, index=frame._get_agg_axis(axis))
+            else:
+                result = DataFrame.apply(frame, Series.count, axis=axis)
+
+        return result
+
+    def _count_level(self, level, axis=0, numeric_only=False):
+        if numeric_only:
+            frame = self._get_numeric_data()
+        else:
+            frame = self
+
+        if axis == 1:
+            frame = frame.T
+
+        # python 2.5
+        mask = notnull(frame.values).view(np.uint8)
+
+        if isinstance(level, basestring):
+            level = self.index._get_level_number(level)
+
+        level_index = frame.index.levels[level]
+        labels = com._ensure_int64(frame.index.labels[level])
+        counts = lib.count_level_2d(mask, labels, len(level_index))
+
+        result = DataFrame(counts, index=level_index,
+                           columns=frame.columns)
+
+        if axis == 1:
+            return result.T
+        else:
+            return result
+
+    def any(self, axis=0, bool_only=None, skipna=True, level=None):
+        """
+        Return whether any element is True over requested axis.
+        %(na_action)s
+
+        Parameters
+        ----------
+        axis : {0, 1}
+            0 for row-wise, 1 for column-wise
+        skipna : boolean, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA
+        level : int, default None
+            If the axis is a MultiIndex (hierarchical), count along a
+            particular level, collapsing into a DataFrame
+        bool_only : boolean, default None
+            Only include boolean data.
+
+        Returns
+        -------
+        any : Series (or DataFrame if level specified)
+        """
+        if level is not None:
+            return self._agg_by_level('any', axis=axis, level=level,
+                                      skipna=skipna)
+        return self._reduce(nanops.nanany, axis=axis, skipna=skipna,
+                            numeric_only=bool_only, filter_type='bool')
+
+    def all(self, axis=0, bool_only=None, skipna=True, level=None):
+        """
+        Return whether all elements are True over requested axis.
+        %(na_action)s
+
+        Parameters
+        ----------
+        axis : {0, 1}
+            0 for row-wise, 1 for column-wise
+        skipna : boolean, default True
+            Exclude NA/null values.
If an entire row/column is NA, the result + will be NA + level : int, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a DataFrame + bool_only : boolean, default None + Only include boolean data. + + Returns + ------- + any : Series (or DataFrame if level specified) + """ + if level is not None: + return self._agg_by_level('all', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanall, axis=axis, skipna=skipna, + numeric_only=bool_only, filter_type='bool') + + @Substitution(name='sum', shortname='sum', na_action=_doc_exclude_na, + extras=_numeric_only_doc) + @Appender(_stat_doc) + def sum(self, axis=0, numeric_only=None, skipna=True, level=None): + if level is not None: + return self._agg_by_level('sum', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nansum, axis=axis, skipna=skipna, + numeric_only=numeric_only) + + @Substitution(name='mean', shortname='mean', na_action=_doc_exclude_na, + extras='') + @Appender(_stat_doc) + def mean(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('mean', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanmean, axis=axis, skipna=skipna, + numeric_only=None) + + @Substitution(name='minimum', shortname='min', na_action=_doc_exclude_na, + extras='') + @Appender(_stat_doc) + def min(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('min', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanmin, axis=axis, skipna=skipna, + numeric_only=None) + + @Substitution(name='maximum', shortname='max', na_action=_doc_exclude_na, + extras='') + @Appender(_stat_doc) + def max(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('max', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanmax, axis=axis, skipna=skipna, + numeric_only=None) + + @Substitution(name='product', shortname='product', + na_action='NA/null values are treated as 1', extras='') + @Appender(_stat_doc) + def prod(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('prod', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanprod, axis=axis, skipna=skipna, + numeric_only=None) + + product = prod + + @Substitution(name='median', shortname='median', na_action=_doc_exclude_na, + extras='') + @Appender(_stat_doc) + def median(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('median', axis=axis, level=level, + skipna=skipna) + return self._reduce(nanops.nanmedian, axis=axis, skipna=skipna, + numeric_only=None) + + @Substitution(name='mean absolute deviation', shortname='mad', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def mad(self, axis=0, skipna=True, level=None): + if level is not None: + return self._agg_by_level('mad', axis=axis, level=level, + skipna=skipna) + + frame = self._get_numeric_data() + + if axis == 0: + demeaned = frame - frame.mean(axis=0) + else: + demeaned = frame.sub(frame.mean(axis=1), axis=0) + return np.abs(demeaned).mean(axis=axis, skipna=skipna) + + @Substitution(name='variance', shortname='var', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def var(self, axis=0, skipna=True, level=None, ddof=1): + if level is not None: + return self._agg_by_level('var', axis=axis, level=level, + skipna=skipna, ddof=ddof) + return self._reduce(nanops.nanvar, axis=axis, 
skipna=skipna,
+                            numeric_only=None, ddof=ddof)
+
+    @Substitution(name='standard deviation', shortname='std',
+                  na_action=_doc_exclude_na, extras='')
+    @Appender(_stat_doc)
+    def std(self, axis=0, skipna=True, level=None, ddof=1):
+        if level is not None:
+            return self._agg_by_level('std', axis=axis, level=level,
+                                      skipna=skipna, ddof=ddof)
+        return np.sqrt(self.var(axis=axis, skipna=skipna, ddof=ddof))
+
+    @Substitution(name='unbiased skewness', shortname='skew',
+                  na_action=_doc_exclude_na, extras='')
+    @Appender(_stat_doc)
+    def skew(self, axis=0, skipna=True, level=None):
+        if level is not None:
+            return self._agg_by_level('skew', axis=axis, level=level,
+                                      skipna=skipna)
+        return self._reduce(nanops.nanskew, axis=axis, skipna=skipna,
+                            numeric_only=None)
+
+
+    @Substitution(name='unbiased kurtosis', shortname='kurt',
+                  na_action=_doc_exclude_na, extras='')
+    @Appender(_stat_doc)
+    def kurt(self, axis=0, skipna=True, level=None):
+        if level is not None:
+            return self._agg_by_level('kurt', axis=axis, level=level,
+                                      skipna=skipna)
+        return self._reduce(nanops.nankurt, axis=axis, skipna=skipna,
+                            numeric_only=None)
+
+    def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwds):
+        grouped = self.groupby(level=level, axis=axis)
+        if hasattr(grouped, name) and skipna:
+            return getattr(grouped, name)(**kwds)
+        method = getattr(type(self), name)
+        applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwds)
+        return grouped.aggregate(applyf)
+
+    def _reduce(self, op, axis=0, skipna=True, numeric_only=None,
+                filter_type=None, **kwds):
+        f = lambda x: op(x, axis=axis, skipna=skipna, **kwds)
+        labels = self._get_agg_axis(axis)
+        if numeric_only is None:
+            try:
+                values = self.values
+                result = f(values)
+            except Exception:
+                if filter_type is None or filter_type == 'numeric':
+                    data = self._get_numeric_data()
+                elif filter_type == 'bool':
+                    data = self._get_bool_data()
+                else:
+                    raise ValueError('Invalid filter_type %s ' %
+                                     str(filter_type))
+                result = f(data.values)
+                labels = data._get_agg_axis(axis)
+        else:
+            if numeric_only:
+                if filter_type is None or filter_type == 'numeric':
+                    data = self._get_numeric_data()
+                elif filter_type == 'bool':
+                    data = self._get_bool_data()
+                else:
+                    raise ValueError('Invalid filter_type %s ' %
+                                     str(filter_type))
+                values = data.values
+                labels = data._get_agg_axis(axis)
+            else:
+                values = self.values
+            result = f(values)
+
+        if result.dtype == np.object_:
+            try:
+                if filter_type is None or filter_type == 'numeric':
+                    result = result.astype(np.float64)
+                elif filter_type == 'bool' and notnull(result).all():
+                    result = result.astype(np.bool_)
+                else:
+                    raise ValueError('Invalid dtype %s ' % str(filter_type))
+
+            except (ValueError, TypeError):
+                pass
+
+        return Series(result, index=labels)
+
+    def idxmin(self, axis=0, skipna=True):
+        """
+        Return index of first occurrence of minimum over requested axis.
+        NA/null values are excluded.
+
+        Parameters
+        ----------
+        axis : {0, 1}
+            0 for row-wise, 1 for column-wise
+        skipna : boolean, default True
+            Exclude NA/null values. If an entire row/column is NA, the result
+            will be NA
+
+        Returns
+        -------
+        idxmin : Series
+        """
+        indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna)
+        index = self._get_axis(axis)
+        result = [index[i] if i >= 0 else np.nan for i in indices]
+        return Series(result, index=self._get_agg_axis(axis))
+
+    def idxmax(self, axis=0, skipna=True):
+        """
+        Return index of first occurrence of maximum over requested axis.
+        NA/null values are excluded.
+ + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be first index. + + Returns + ------- + idxmax : Series + """ + indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) + index = self._get_axis(axis) + result = [index[i] if i >= 0 else np.nan for i in indices] + return Series(result, index=self._get_agg_axis(axis)) + + def _get_agg_axis(self, axis_num): + if axis_num == 0: + return self.columns + elif axis_num == 1: + return self.index + else: + raise Exception('Must have 0<= axis <= 1') + + def _get_numeric_data(self): + if self._is_mixed_type: + num_data = self._data.get_numeric_data() + return DataFrame(num_data, copy=False) + else: + if (self.values.dtype != np.object_ and + not issubclass(self.values.dtype.type, np.datetime64)): + return self + else: + return self.ix[:, []] + + def _get_bool_data(self): + if self._is_mixed_type: + bool_data = self._data.get_bool_data() + return DataFrame(bool_data, copy=False) + else: + if self.values.dtype == np.bool_: + return self + else: + return self.ix[:, []] + + def quantile(self, q=0.5, axis=0): + """ + Return values at the given quantile over requested axis, a la + scoreatpercentile in scipy.stats + + Parameters + ---------- + q : quantile, default 0.5 (50% quantile) + 0 <= q <= 1 + axis : {0, 1} + 0 for row-wise, 1 for column-wise + + Returns + ------- + quantiles : Series + """ + per = q * 100 + + def f(arr): + arr = arr.values + if arr.dtype != np.float_: + arr = arr.astype(float) + arr = arr[notnull(arr)] + if len(arr) == 0: + return nan + else: + return _quantile(arr, per) + + return self.apply(f, axis=axis) + + def clip(self, upper=None, lower=None): + """ + Trim values at input threshold(s) + + Parameters + ---------- + lower : float, default None + upper : float, default None + + Returns + ------- + clipped : DataFrame + """ + return self.apply(lambda x: x.clip(lower=lower, upper=upper)) + + def clip_upper(self, threshold): + """ + Trim values above threshold + + Returns + ------- + clipped : DataFrame + """ + return self.apply(lambda x: x.clip_upper(threshold)) + + def clip_lower(self, threshold): + """ + Trim values below threshold + + Returns + ------- + clipped : DataFrame + """ + return self.apply(lambda x: x.clip_lower(threshold)) + + def rank(self, axis=0, numeric_only=None, method='average', + na_option='keep', ascending=True): + """ + Compute numerical data ranks (1 through n) along axis. 
Equal values are + assigned a rank that is the average of the ranks of those values + + Parameters + ---------- + axis : {0, 1}, default 0 + Ranks over columns (0) or rows (1) + numeric_only : boolean, default None + Include only float, int, boolean data + method : {'average', 'min', 'max', 'first'} + average: average rank of group + min: lowest rank in group + max: highest rank in group + first: ranks assigned in order they appear in the array + na_option : {'keep'} + keep: leave NA values where they are + ascending : boolean, default True + False for ranks by high (1) to low (N) + + Returns + ------- + ranks : DataFrame + """ + if numeric_only is None: + try: + ranks = algos.rank(self.values, axis=axis, method=method, + ascending=ascending) + return DataFrame(ranks, index=self.index, columns=self.columns) + except TypeError: + numeric_only = True + + if numeric_only: + data = self._get_numeric_data() + else: + data = self + ranks = algos.rank(data.values, axis=axis, method=method, + ascending=ascending) + return DataFrame(ranks, index=data.index, columns=data.columns) + + def to_timestamp(self, freq=None, how='start', axis=0, copy=True): + """ + Cast to DatetimeIndex of timestamps, at *beginning* of period + + Parameters + ---------- + freq : string, default frequency of PeriodIndex + Desired frequency + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. end + axis : {0, 1} default 0 + The axis to convert (the index by default) + copy : boolean, default True + If false then underlying input data is not copied + + Returns + ------- + df : DataFrame with DatetimeIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + if axis == 0: + new_data.set_axis(1, self.index.to_timestamp(freq=freq, how=how)) + elif axis == 1: + new_data.set_axis(0, self.columns.to_timestamp(freq=freq, how=how)) + else: + raise ValueError('Axis must be 0 or 1. Got %s' % str(axis)) + + return DataFrame(new_data) + + def to_period(self, freq=None, axis=0, copy=True): + """ + Convert DataFrame from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed) + + Parameters + ---------- + freq : string, default + axis : {0, 1}, default 0 + The axis to convert (the index by default) + copy : boolean, default True + If False then underlying input data is not copied + + Returns + ------- + ts : TimeSeries with PeriodIndex + """ + new_data = self._data + if copy: + new_data = new_data.copy() + + if axis == 0: + if freq is None: + freq = self.index.freqstr or self.index.inferred_freq + new_data.set_axis(1, self.index.to_period(freq=freq)) + elif axis == 1: + if freq is None: + freq = self.columns.freqstr or self.columns.inferred_freq + new_data.set_axis(0, self.columns.to_period(freq=freq)) + else: + raise ValueError('Axis must be 0 or 1. Got %s' % str(axis)) + + return DataFrame(new_data) + + #---------------------------------------------------------------------- + # Deprecated stuff + + def combineAdd(self, other): + """ + Add two DataFrame objects and do not propagate + NaN values, so if for a (column, time) one frame is missing a + value, it will default to the other frame's value (which might + be NaN as well) + + Parameters + ---------- + other : DataFrame + + Returns + ------- + DataFrame + """ + return self.add(other, fill_value=0.) 
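A minimal sketch of how the deprecated combineAdd wrapper above behaves (the frames `df1` and `df2` below are hypothetical; missing entries on either side are treated as 0 before adding):

    >>> df1 = DataFrame({'A': [1., 2.]}, index=[0, 1])
    >>> df2 = DataFrame({'A': [10., 20.]}, index=[1, 2])
    >>> df1.combineAdd(df2)          # index 0 -> 1.0, index 1 -> 12.0, index 2 -> 20.0
    >>> df1.add(df2, fill_value=0.)  # identical result; combineAdd simply delegates here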
+ + def combineMult(self, other): + """ + Multiply two DataFrame objects and do not propagate NaN values, so if + for a (column, time) one frame is missing a value, it will default to + the other frame's value (which might be NaN as well) + + Parameters + ---------- + other : DataFrame + + Returns + ------- + DataFrame + """ + return self.mul(other, fill_value=1.) + + +_EMPTY_SERIES = Series([]) + + +def group_agg(values, bounds, f): + """ + R-style aggregator + + Parameters + ---------- + values : N-length or N x K ndarray + bounds : B-length ndarray + f : ndarray aggregation function + + Returns + ------- + ndarray with same length as bounds array + """ + if values.ndim == 1: + N = len(values) + result = np.empty(len(bounds), dtype=float) + elif values.ndim == 2: + N, K = values.shape + result = np.empty((len(bounds), K), dtype=float) + + testagg = f(values[:min(1, len(values))]) + if isinstance(testagg, np.ndarray) and testagg.ndim == 2: + raise Exception('Passed function does not aggregate!') + + for i, left_bound in enumerate(bounds): + if i == len(bounds) - 1: + right_bound = N + else: + right_bound = bounds[i + 1] + + result[i] = f(values[left_bound:right_bound]) + + return result + + +def factor_agg(factor, vec, func): + """ + Aggregate array based on Factor + + Parameters + ---------- + factor : Factor + length n + vec : sequence + length n + func : function + 1D array aggregation function + + Returns + ------- + ndarray corresponding to Factor levels + """ + indexer = np.argsort(factor.labels) + unique_labels = np.arange(len(factor.levels)) + + ordered_labels = factor.labels.take(indexer) + ordered_vec = np.asarray(vec).take(indexer) + bounds = ordered_labels.searchsorted(unique_labels) + + return group_agg(ordered_vec, bounds, func) + + +def extract_index(data): + from pandas.core.index import _union_indexes + + index = None + if len(data) == 0: + index = Index([]) + elif len(data) > 0 and index is None: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_series = False + have_dicts = False + + for v in data.values(): + if isinstance(v, Series): + have_series = True + indexes.append(v.index) + elif isinstance(v, dict): + have_dicts = True + indexes.append(v.keys()) + elif isinstance(v, (list, tuple, np.ndarray)): + have_raw_arrays = True + raw_lengths.append(len(v)) + + if not indexes and not raw_lengths: + raise ValueError('If use all scalar values, must pass index') + + if have_series or have_dicts: + index = _union_indexes(indexes) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('arrays must all be same length') + + if have_dicts: + raise ValueError('Mixing dicts with non-Series may lead to ' + 'ambiguous ordering.') + + if have_series: + assert(lengths[0] == len(index)) + else: + index = Index(np.arange(lengths[0])) + + return _ensure_index(index) + + +def _prep_ndarray(values, copy=True): + if not isinstance(values, np.ndarray): + arr = np.asarray(values) + # NumPy strings are a pain, convert to object + if issubclass(arr.dtype.type, basestring): + arr = np.array(values, dtype=object, copy=True) + values = arr + else: + # drop subclass info, do not copy data + values = np.asarray(values) + if copy: + values = values.copy() + + if values.ndim == 1: + N = values.shape[0] + if N == 0: + values = values.reshape((values.shape[0], 0)) + else: + values = values.reshape((values.shape[0], 1)) + elif values.ndim != 2: + raise Exception('Must pass 2-d input') + + return values + + +def _rec_to_dict(arr): + if 
isinstance(arr, np.ndarray): + columns = list(arr.dtype.names) + sdict = dict((k, arr[k]) for k in columns) + elif isinstance(arr, DataFrame): + columns = list(arr.columns) + sdict = dict((k, v.values) for k, v in arr.iteritems()) + elif isinstance(arr, dict): + columns = sorted(arr) + sdict = arr.copy() + else: # pragma: no cover + raise TypeError('%s' % type(arr)) + + return columns, sdict + + +def _to_sdict(data, columns, coerce_float=False): + if len(data) == 0: + return {}, columns + if isinstance(data[0], (list, tuple)): + return _list_to_sdict(data, columns, coerce_float=coerce_float) + elif isinstance(data[0], dict): + return _list_of_dict_to_sdict(data, columns, coerce_float=coerce_float) + elif isinstance(data[0], Series): + return _list_of_series_to_sdict(data, columns, + coerce_float=coerce_float) + else: + # last ditch effort + data = map(tuple, data) + return _list_to_sdict(data, columns, coerce_float=coerce_float) + +def _list_to_sdict(data, columns, coerce_float=False): + if len(data) > 0 and isinstance(data[0], tuple): + content = list(lib.to_object_array_tuples(data).T) + elif len(data) > 0: + # list of lists + content = list(lib.to_object_array(data).T) + else: + if columns is None: + columns = [] + return {}, columns + return _convert_object_array(content, columns, + coerce_float=coerce_float) + +def _list_of_series_to_sdict(data, columns, coerce_float=False): + from pandas.core.index import _get_combined_index + + if columns is None: + columns = _get_combined_index([s.index for s in data]) + + indexer_cache = {} + + aligned_values = [] + for s in data: + index = s.index + if id(index) in indexer_cache: + indexer = indexer_cache[id(index)] + else: + indexer = indexer_cache[id(index)] = index.get_indexer(columns) + aligned_values.append(com.take_1d(s.values, indexer)) + + values = np.vstack(aligned_values) + + if values.dtype == np.object_: + content = list(values.T) + return _convert_object_array(content, columns, + coerce_float=coerce_float) + else: + return values, columns + + +def _list_of_dict_to_sdict(data, columns, coerce_float=False): + if columns is None: + gen = (x.keys() for x in data) + columns = lib.fast_unique_multiple_list_gen(gen) + + # assure that they are of the base dict class and not of derived + # classes + data = [(type(d) is dict) and d or dict(d) + for d in data] + + content = list(lib.dicts_to_array(data, list(columns)).T) + return _convert_object_array(content, columns, + coerce_float=coerce_float) + + +def _convert_object_array(content, columns, coerce_float=False): + if columns is None: + columns = range(len(content)) + else: + if len(columns) != len(content): + raise AssertionError('%d columns passed, passed data had %s ' + 'columns' % (len(columns), len(content))) + + sdict = dict((c, lib.maybe_convert_objects(vals, try_float=coerce_float)) + for c, vals in zip(columns, content)) + return sdict, columns + +def _get_names_from_index(data): + index = range(len(data)) + has_some_name = any([s.name is not None for s in data]) + if not has_some_name: + return index + + count = 0 + for i, s in enumerate(data): + n = s.name + if n is not None: + index[i] = n + else: + index[i] = 'Unnamed %d' % count + count += 1 + + return index + +def _homogenize(data, index, columns, dtype=None): + from pandas.core.series import _sanitize_array + + homogenized = {} + + if dtype is not None: + dtype = np.dtype(dtype) + + oindex = None + + for k in columns: + if k not in data: + # no obvious "empty" int column + if dtype is not None and issubclass(dtype.type, 
np.integer): + continue + + v = np.empty(len(index), dtype=dtype) + v.fill(nan) + else: + v = data[k] + + if isinstance(v, Series): + if dtype is not None: + v = v.astype(dtype) + if v.index is not index: + # Forces alignment. No need to copy data since we + # are putting it into an ndarray later + v = v.reindex(index, copy=False) + else: + if isinstance(v, dict): + if oindex is None: + oindex = index.astype('O') + if type(v) == dict: + # fast cython method + v = lib.fast_multiget(v, oindex, default=np.nan) + else: + v = lib.map_infer(oindex, v.get) + + v = _sanitize_array(v, index, dtype=dtype, copy=False, + raise_cast_failure=False) + + homogenized[k] = v + + return homogenized + + +def _put_str(s, space): + return ('%s' % s)[:space].ljust(space) + +def _is_sequence(x): + try: + iter(x) + assert(not isinstance(x, basestring)) + return True + except Exception: + return False + +def install_ipython_completers(): # pragma: no cover + """Register the DataFrame type with IPython's tab completion machinery, so + that it knows about accessing column names as attributes.""" + from IPython.utils.generics import complete_object + + @complete_object.when_type(DataFrame) + def complete_dataframe(obj, prev_completions): + return prev_completions + [c for c in obj.columns \ + if isinstance(c, basestring) and py3compat.isidentifier(c)] + + +# Importing IPython brings in about 200 modules, so we want to avoid it unless +# we're in IPython (when those modules are loaded anyway). +if "IPython" in sys.modules: # pragma: no cover + try: + install_ipython_completers() + except Exception: + pass + +#---------------------------------------------------------------------- +# Add plotting methods to DataFrame + +import pandas.tools.plotting as gfx + +DataFrame.plot = gfx.plot_frame +DataFrame.hist = gfx.hist_frame + +def boxplot(self, column=None, by=None, ax=None, fontsize=None, + rot=0, grid=True, **kwds): + """ + Make a box plot from DataFrame column/columns optionally grouped + (stratified) by one or more columns + + Parameters + ---------- + data : DataFrame + column : column names or list of names, or vector + Can be any valid input to groupby + by : string or sequence + Column in the DataFrame to group by + fontsize : int or string + + Returns + ------- + ax : matplotlib.axes.AxesSubplot + """ + import pandas.tools.plotting as plots + import matplotlib.pyplot as plt + ax = plots.boxplot(self, column=column, by=by, ax=ax, + fontsize=fontsize, grid=grid, rot=rot, **kwds) + plt.draw_if_interactive() + return ax +DataFrame.boxplot = boxplot + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/core/generic.py b/pandas/core/generic.py new file mode 100644 index 00000000..f0061044 --- /dev/null +++ b/pandas/core/generic.py @@ -0,0 +1,967 @@ +# pylint: disable=W0231,E1101 +from datetime import timedelta + +import numpy as np + +from pandas.core.index import MultiIndex +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.offsets import DateOffset +import pandas.core.common as com + +class PandasError(Exception): + pass + + +class PandasObject(object): + + _AXIS_NUMBERS = { + 'index' : 0, + 'columns' : 1 + } + + _AXIS_ALIASES = {} + _AXIS_NAMES = dict((v, k) for k, v in _AXIS_NUMBERS.iteritems()) + + def save(self, path): + com.save(self, path) + + @classmethod + def load(cls, path): + return com.load(path) + + #---------------------------------------------------------------------- + # Axis name 
business + + @classmethod + def _get_axis_number(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) + + if isinstance(axis, int): + if axis in cls._AXIS_NAMES: + return axis + else: + raise Exception('No %d axis' % axis) + else: + return cls._AXIS_NUMBERS[axis] + + @classmethod + def _get_axis_name(cls, axis): + axis = cls._AXIS_ALIASES.get(axis, axis) + if isinstance(axis, basestring): + if axis in cls._AXIS_NUMBERS: + return axis + else: + raise Exception('No axis named %s' % axis) + else: + return cls._AXIS_NAMES[axis] + + def _get_axis(self, axis): + name = self._get_axis_name(axis) + return getattr(self, name) + + def abs(self): + """ + Return an object with absolute value taken. Only applicable to objects + that are all numeric + + Returns + ------- + abs: type of caller + """ + return np.abs(self) + + def get(self, key, default=None): + """ + Get item from object for given key (DataFrame column, Panel slice, + etc.). Returns default value if not found + + Parameters + ---------- + key : object + + Returns + ------- + value : type of items contained in object + """ + try: + return self[key] + except KeyError: + return default + + def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, + group_keys=True): + """ + Group series using mapper (dict or key function, apply given function + to group, return result as series) or by a series of columns + + Parameters + ---------- + by : mapping function / list of functions, dict, Series, or tuple / + list of column names. + Called on each element of the object index to determine the groups. + If a dict or Series is passed, the Series or dict VALUES will be + used to determine the groups + axis : int, default 0 + level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels + as_index : boolean, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output + sort : boolean, default True + Sort group keys. Get better performance by turning this off + group_keys : boolean, default True + When calling apply, add group keys to index to identify pieces + + Examples + -------- + # DataFrame result + >>> data.groupby(func, axis=0).mean() + + # DataFrame result + >>> data.groupby(['col1', 'col2'])['col3'].mean() + + # DataFrame with hierarchical index + >>> data.groupby(['col1', 'col2']).mean() + + Returns + ------- + GroupBy object + """ + from pandas.core.groupby import groupby + return groupby(self, by, axis=axis, level=level, as_index=as_index, + sort=sort, group_keys=group_keys) + + def asfreq(self, freq, method=None, how=None): + """ + Convert all TimeSeries inside to specified frequency using DateOffset + objects. Optionally provide fill method to pad/backfill missing values. 
+
+        Parameters
+        ----------
+        freq : DateOffset object, or string
+        method : {'backfill', 'bfill', 'pad', 'ffill', None}
+            Method to use for filling holes in reindexed Series
+            pad / ffill: propagate last valid observation forward to next valid
+            backfill / bfill: use NEXT valid observation to fill gap
+        how : {'start', 'end'}, default end
+            For PeriodIndex only, see PeriodIndex.asfreq
+
+        Returns
+        -------
+        converted : type of caller
+        """
+        from pandas.tseries.resample import asfreq
+        return asfreq(self, freq, method=method, how=how)
+
+    def resample(self, rule, how='mean', axis=0, fill_method=None,
+                 closed='right', label='right', convention=None,
+                 kind=None, loffset=None, limit=None, base=0):
+        """
+        Convenience method for frequency conversion and resampling of regular
+        time-series data.
+
+        Parameters
+        ----------
+        rule : the offset string or object representing target conversion
+        how : string, method for down- or re-sampling, default 'mean'
+        fill_method : string, fill_method for upsampling, default None
+        axis : int, optional, default 0
+        closed : {'right', 'left'}, default 'right'
+            Which side of bin interval is closed
+        label : {'right', 'left'}, default 'right'
+            Which bin edge label to label bucket with
+        convention : {'start', 'end', 's', 'e'}
+        loffset : timedelta
+            Adjust the resampled time labels
+        base : int, default 0
+            For frequencies that evenly subdivide 1 day, the "origin" of the
+            aggregated intervals. For example, for '5min' frequency, base could
+            range from 0 through 4. Defaults to 0
+        """
+        from pandas.tseries.resample import TimeGrouper
+        sampler = TimeGrouper(rule, label=label, closed=closed, how=how,
+                              axis=axis, kind=kind, loffset=loffset,
+                              fill_method=fill_method, convention=convention,
+                              limit=limit, base=base)
+        return sampler.resample(self)
+
+    def first(self, offset):
+        """
+        Convenience method for subsetting initial periods of time series data
+        based on a date offset
+
+        Parameters
+        ----------
+        offset : string, DateOffset, dateutil.relativedelta
+
+        Examples
+        --------
+        ts.first('10D') -> First 10 days
+
+        Returns
+        -------
+        subset : type of caller
+        """
+        from pandas.tseries.frequencies import to_offset
+        if not isinstance(self.index, DatetimeIndex):
+            raise NotImplementedError
+
+        if len(self.index) == 0:
+            return self
+
+        offset = to_offset(offset)
+        end_date = end = self.index[0] + offset
+
+        # Tick-like, e.g. 3 weeks
+        if not offset.isAnchored() and hasattr(offset, '_inc'):
+            if end_date in self.index:
+                end = self.index.searchsorted(end_date, side='left')
+
+        return self.ix[:end]
+
+    def last(self, offset):
+        """
+        Convenience method for subsetting final periods of time series data
+        based on a date offset
+
+        Parameters
+        ----------
+        offset : string, DateOffset, dateutil.relativedelta
+
+        Examples
+        --------
+        ts.last('5M') -> Last 5 months
+
+        Returns
+        -------
+        subset : type of caller
+        """
+        from pandas.tseries.frequencies import to_offset
+        if not isinstance(self.index, DatetimeIndex):
+            raise NotImplementedError
+
+        if len(self.index) == 0:
+            return self
+
+        offset = to_offset(offset)
+
+        start_date = start = self.index[-1] - offset
+        start = self.index.searchsorted(start_date, side='right')
+        return self.ix[start:]
+
+    def select(self, crit, axis=0):
+        """
+        Return data corresponding to axis labels matching criteria
+
+        Parameters
+        ----------
+        crit : function
+            To be called on each index (label).
Should return True or False + axis : int + + Returns + ------- + selection : type of caller + """ + axis_name = self._get_axis_name(axis) + axis = self._get_axis(axis) + + if len(axis) > 0: + new_axis = axis[np.asarray([crit(label) for label in axis])] + else: + new_axis = axis + + return self.reindex(**{axis_name : new_axis}) + + def drop(self, labels, axis=0, level=None): + """ + Return new object with labels in requested axis removed + + Parameters + ---------- + labels : array-like + axis : int + level : int or name, default None + For MultiIndex + + Returns + ------- + dropped : type of caller + """ + axis_name = self._get_axis_name(axis) + axis = self._get_axis(axis) + + if level is not None: + assert(isinstance(axis, MultiIndex)) + new_axis = axis.drop(labels, level=level) + else: + new_axis = axis.drop(labels) + + return self.reindex(**{axis_name : new_axis}) + + def sort_index(self, axis=0, ascending=True): + """ + Sort object by labels (along an axis) + + Parameters + ---------- + axis : {0, 1} + Sort index/rows versus columns + ascending : boolean, default True + Sort ascending vs. descending + + Returns + ------- + sorted_obj : type of caller + """ + axis = self._get_axis_number(axis) + axis_name = self._get_axis_name(axis) + labels = self._get_axis(axis) + + sort_index = labels.argsort() + if not ascending: + sort_index = sort_index[::-1] + + new_axis = labels.take(sort_index) + return self.reindex(**{axis_name : new_axis}) + + @property + def ix(self): + raise NotImplementedError + + def reindex(self, *args, **kwds): + raise NotImplementedError + + def tshift(self, periods=1, freq=None, **kwds): + """ + Shift the time index, using the index's frequency if available + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + freq : DateOffset, timedelta, or time rule string, default None + Increment to use from datetools module or time rule (e.g. 'EOM') + + Notes + ----- + If freq is not specified then tries to use the freq or inferred_freq + attributes of the index. If neither of those attributes exist, a + ValueError is thrown + + Returns + ------- + shifted : Series + """ + if freq is None: + freq = getattr(self.index, 'freq', None) + + if freq is None: + freq = getattr(self.index, 'inferred_freq', None) + + if freq is None: + msg = 'Freq was not given and was not set in the index' + raise ValueError(msg) + + return self.shift(periods, freq, **kwds) + + def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, + **kwds): + """ + Percent change over given number of periods + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change + fill_method : str, default 'pad' + How to handle NAs before computing percent changes + limit : int, default None + The number of consecutive NAs to fill before stopping + freq : DateOffset, timedelta, or offset alias string, optional + Increment to use from time series API (e.g. 'M' or BDay()) + + Returns + ------- + chg : Series or DataFrame + """ + if fill_method is None: + data = self + else: + data = self.fillna(method=fill_method, limit=limit) + rs = data / data.shift(periods=periods, freq=freq, **kwds) - 1 + if freq is None: + mask = com.isnull(self.values) + np.putmask(rs.values, mask, np.nan) + return rs + + +class NDFrame(PandasObject): + """ + N-dimensional analogue of DataFrame. 
Store multi-dimensional in a + size-mutable, labeled data structure + + Parameters + ---------- + data : BlockManager + axes : list + copy : boolean, default False + """ + # kludge + _default_stat_axis = 0 + + def __init__(self, data, axes=None, copy=False, dtype=None): + if dtype is not None: + data = data.astype(dtype) + elif copy: + data = data.copy() + + if axes is not None: + for i, ax in enumerate(axes): + data = data.reindex_axis(ax, axis=i) + + self._data = data + self._item_cache = {} + + def astype(self, dtype): + """ + Cast object to input numpy.dtype + + Parameters + ---------- + dtype : numpy.dtype or Python type + + Returns + ------- + casted : type of caller + """ + return self._constructor(self._data, dtype=dtype) + + @property + def _constructor(self): + return NDFrame + + @property + def axes(self): + return self._data.axes + + def __repr__(self): + return 'NDFrame' + + @property + def values(self): + return self._data.as_matrix() + + @property + def ndim(self): + return self._data.ndim + + def _set_axis(self, axis, labels): + self._data.set_axis(axis, labels) + self._clear_item_cache() + + def __getitem__(self, item): + return self._get_item_cache(item) + + def _get_item_cache(self, item): + cache = self._item_cache + try: + return cache[item] + except Exception: + values = self._data.get(item) + res = self._box_item_values(item, values) + cache[item] = res + return res + + def _box_item_values(self, key, values): + raise NotImplementedError + + def _clear_item_cache(self): + self._item_cache.clear() + + def _set_item(self, key, value): + if hasattr(self,'columns') and isinstance(self.columns, MultiIndex): + # Pad the key with empty strings if lower levels of the key + # aren't specified: + if not isinstance(key, tuple): + key = (key,) + if len(key) != self.columns.nlevels: + key += ('',)*(self.columns.nlevels - len(key)) + self._data.set(key, value) + + try: + del self._item_cache[key] + except KeyError: + pass + + def __delitem__(self, key): + """ + Delete item + """ + deleted = False + + maybe_shortcut = False + if hasattr(self,'columns') and isinstance(self.columns, MultiIndex): + try: + maybe_shortcut = key not in self.columns._engine + except TypeError: + pass + + if maybe_shortcut: + # Allow shorthand to delete all columns whose first len(key) + # elements match key: + if not isinstance(key,tuple): + key = (key,) + for col in self.columns: + if isinstance(col,tuple) and col[:len(key)] == key: + del self[col] + deleted = True + if not deleted: + # If the above loop ran and didn't delete anything because + # there was no match, this call should raise the appropriate + # exception: + self._data.delete(key) + + try: + del self._item_cache[key] + except KeyError: + pass + + def pop(self, item): + """ + Return item and drop from frame. Raise KeyError if not found. + """ + result = self[item] + del self[item] + return result + + def _expand_axes(self, key): + new_axes = [] + for k, ax in zip(key, self.axes): + if k not in ax: + if type(k) != ax.dtype.type: + ax = ax.astype('O') + new_axes.append(ax.insert(len(ax), k)) + else: + new_axes.append(ax) + + return new_axes + + #---------------------------------------------------------------------- + # Consolidation of internals + + def _consolidate_inplace(self): + self._clear_item_cache() + self._data = self._data.consolidate() + + def consolidate(self, inplace=False): + """ + Compute NDFrame with "consolidated" internals (data of each dtype + grouped together in a single ndarray). 
Mainly an internal API function, + but available here to the savvy user + + Parameters + ---------- + inplace : boolean, default False + If False return new object, otherwise modify existing object + + Returns + ------- + consolidated : type of caller + """ + if inplace: + self._consolidate_inplace() + return self + else: + cons_data = self._data.consolidate() + if cons_data is self._data: + cons_data = cons_data.copy() + return self._constructor(cons_data) + + @property + def _is_mixed_type(self): + self._consolidate_inplace() + return len(self._data.blocks) > 1 + + def _reindex_axis(self, new_index, fill_method, axis, copy): + new_data = self._data.reindex_axis(new_index, axis=axis, + method=fill_method, copy=copy) + + if new_data is self._data and not copy: + return self + else: + return self._constructor(new_data) + + def cumsum(self, axis=None, skipna=True): + """ + Return DataFrame of cumulative sums over requested axis. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + + Returns + ------- + y : DataFrame + """ + if axis is None: + axis = self._default_stat_axis + else: + axis = self._get_axis_number(axis) + + y = self.values.copy() + if not issubclass(y.dtype.type, np.integer): + mask = np.isnan(self.values) + + if skipna: + np.putmask(y, mask, 0.) + + result = y.cumsum(axis) + + if skipna: + np.putmask(result, mask, np.nan) + else: + result = y.cumsum(axis) + return self._wrap_array(result, self.axes, copy=False) + + def _wrap_array(self, array, axes, copy=False): + raise NotImplementedError + + def cumprod(self, axis=None, skipna=True): + """ + Return cumulative product over requested axis as DataFrame + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + + Returns + ------- + y : DataFrame + """ + if axis is None: + axis = self._default_stat_axis + else: + axis = self._get_axis_number(axis) + + y = self.values.copy() + if not issubclass(y.dtype.type, np.integer): + mask = np.isnan(self.values) + + if skipna: + np.putmask(y, mask, 1.) + result = y.cumprod(axis) + + if skipna: + np.putmask(result, mask, np.nan) + else: + result = y.cumprod(axis) + return self._wrap_array(result, self.axes, copy=False) + + def cummax(self, axis=None, skipna=True): + """ + Return DataFrame of cumulative max over requested axis. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + + Returns + ------- + y : DataFrame + """ + if axis is None: + axis = self._default_stat_axis + else: + axis = self._get_axis_number(axis) + + y = self.values.copy() + if not issubclass(y.dtype.type, np.integer): + mask = np.isnan(self.values) + + if skipna: + np.putmask(y, mask, -np.inf) + + result = np.maximum.accumulate(y, axis) + + if skipna: + np.putmask(result, mask, np.nan) + else: + result = np.maximum.accumulate(y,axis) + return self._wrap_array(result, self.axes, copy=False) + + def cummin(self, axis=None, skipna=True): + """ + Return DataFrame of cumulative min over requested axis. + + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + skipna : boolean, default True + Exclude NA/null values. 
If an entire row/column is NA, the result + will be NA + + Returns + ------- + y : DataFrame + """ + if axis is None: + axis = self._default_stat_axis + else: + axis = self._get_axis_number(axis) + + y = self.values.copy() + if not issubclass(y.dtype.type, np.integer): + mask = np.isnan(self.values) + + if skipna: + np.putmask(y, mask, np.inf) + + result = np.minimum.accumulate(y, axis) + + if skipna: + np.putmask(result, mask, np.nan) + else: + result = np.minimum.accumulate(y,axis) + return self._wrap_array(result, self.axes, copy=False) + + def copy(self, deep=True): + """ + Make a copy of this object + + Parameters + ---------- + deep : boolean, default True + Make a deep copy, i.e. also copy data + + Returns + ------- + copy : type of caller + """ + data = self._data + if deep: + data = data.copy() + return self._constructor(data) + + def swaplevel(self, i, j, axis=0): + """ + Swap levels i and j in a MultiIndex on a particular axis + + Returns + ------- + swapped : type of caller (new object) + """ + axis = self._get_axis_number(axis) + result = self.copy() + labels = result._data.axes[axis] + result._data.set_axis(axis, labels.swaplevel(i, j)) + return result + + def add_prefix(self, prefix): + """ + Concatenate prefix string with panel items names. + + Parameters + ---------- + prefix : string + + Returns + ------- + with_prefix : type of caller + """ + new_data = self._data.add_prefix(prefix) + return self._constructor(new_data) + + def add_suffix(self, suffix): + """ + Concatenate suffix string with panel items names + + Parameters + ---------- + suffix : string + + Returns + ------- + with_suffix : type of caller + """ + new_data = self._data.add_suffix(suffix) + return self._constructor(new_data) + + def rename_axis(self, mapper, axis=0, copy=True): + """ + Alter index and / or columns using input function or functions. + Function / dict values must be unique (1-to-1). Labels not contained in + a dict / Series will be left as-is. + + Parameters + ---------- + mapper : dict-like or function, optional + axis : int, default 0 + copy : boolean, default True + Also copy underlying data + + See also + -------- + DataFrame.rename + + Returns + ------- + renamed : type of caller + """ + # should move this at some point + from pandas.core.series import _get_rename_function + + mapper_f = _get_rename_function(mapper) + + if axis == 0: + new_data = self._data.rename_items(mapper_f, copydata=copy) + else: + new_data = self._data.rename_axis(mapper_f, axis=axis) + if copy: + new_data = new_data.copy() + + return self._constructor(new_data) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + + Parameters + ---------- + indices : list / array of ints + axis : int, default 0 + + Returns + ------- + taken : type of caller + """ + if axis == 0: + labels = self._get_axis(axis) + new_items = labels.take(indices) + new_data = self._data.reindex_axis(new_items, axis=0) + else: + new_data = self._data.take(indices, axis=axis) + return self._constructor(new_data) + + def tz_convert(self, tz, axis=0, copy=True): + """ + Convert TimeSeries to target time zone. If it is time zone naive, it + will be localized to the passed time zone. 
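A rough sketch of the localize/convert pattern this API supports (hypothetical toy series; the zone names require pytz):

    import numpy as np
    import pandas as pd

    rng = pd.date_range('2012-06-29 09:00', periods=3, freq='H')
    ts = pd.Series(np.arange(3.), index=rng)   # tz-naive DatetimeIndex
    eastern = ts.tz_localize('US/Eastern')     # attach a zone to naive times
    utc = eastern.tz_convert('UTC')            # convert between zones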
+ + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + + Returns + ------- + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + if not hasattr(ax, 'tz_convert'): + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + + new_data = self._data + if copy: + new_data = new_data.copy() + + new_obj = self._constructor(new_data) + new_ax = ax.tz_convert(tz) + + if axis == 0: + new_obj._set_axis(1, new_ax) + elif axis == 1: + new_obj._set_axis(0, new_ax) + self._clear_item_cache() + + return new_obj + + def tz_localize(self, tz, axis=0, copy=True): + """ + Localize tz-naive TimeSeries to target time zone + + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + + Returns + ------- + """ + axis = self._get_axis_number(axis) + ax = self._get_axis(axis) + + if not hasattr(ax, 'tz_localize'): + ax_name = self._get_axis_name(axis) + raise TypeError('%s is not a valid DatetimeIndex or PeriodIndex' % + ax_name) + + new_data = self._data + if copy: + new_data = new_data.copy() + + new_obj = self._constructor(new_data) + new_ax = ax.tz_localize(tz) + + if axis == 0: + new_obj._set_axis(1, new_ax) + elif axis == 1: + new_obj._set_axis(0, new_ax) + self._clear_item_cache() + + return new_obj + +# Good for either Series or DataFrame + +def truncate(self, before=None, after=None, copy=True): + """Function truncate a sorted DataFrame / Series before and/or after + some particular dates. + + Parameters + ---------- + before : date + Truncate before date + after : date + Truncate after date + + Returns + ------- + truncated : type of caller + """ + from pandas.tseries.tools import to_datetime + before = to_datetime(before) + after = to_datetime(after) + + if before is not None and after is not None: + assert(before <= after) + + result = self.ix[before:after] + + if isinstance(self.index, MultiIndex): + result.index = self.index.truncate(before, after) + + if copy: + result = result.copy() + + return result + diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py new file mode 100644 index 00000000..c5889692 --- /dev/null +++ b/pandas/core/groupby.py @@ -0,0 +1,2171 @@ +from itertools import izip +import types +import numpy as np + +from pandas.core.algorithms import unique +from pandas.core.categorical import Factor +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.internals import BlockManager, make_block +from pandas.core.series import Series +from pandas.core.panel import Panel +from pandas.util.decorators import cache_readonly, Appender +import pandas.core.algorithms as algos +import pandas.core.common as com +import pandas.lib as lib + + +class GroupByError(Exception): + pass + + +def _groupby_function(name, alias, npfunc): + def f(self): + try: + return self._cython_agg_general(alias) + except Exception: + return self.aggregate(lambda x: npfunc(x, axis=self.axis)) + + f.__doc__ = "Compute %s of group values" % name + f.__name__ = name + + return f + +def _first_compat(x, axis=0): + x = np.asarray(x) + x = x[com.notnull(x)] + if len(x) == 0: + return np.nan + return x[0] + +def _last_compat(x, axis=0): + x = np.asarray(x) + x = x[com.notnull(x)] + if len(x) == 0: + return np.nan + return x[-1] + + +class GroupBy(object): + """ + Class for 
grouping and aggregating relational data. See aggregate, + transform, and apply functions on this object. + + It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: + + :: + + grouped = groupby(obj, ...) + + Parameters + ---------- + obj : pandas object + axis : int, default 0 + level : int, default None + Level of MultiIndex + groupings : list of Grouping objects + Most users should ignore this + exclusions : array-like, optional + List of columns to exclude + name : string + Most users should ignore this + + Notes + ----- + After grouping, see aggregate, apply, and transform functions. Here are + some other brief notes about usage. When grouping by multiple groups, the + result index will be a MultiIndex (hierarhical) by default. + + Iteration produces (key, group) tuples, i.e. chunking the data by group. So + you can write code like: + + :: + + grouped = obj.groupby(keys, axis=axis) + for key, group in grouped: + # do something with the data + + Function calls on GroupBy, if not specially implemented, "dispatch" to the + grouped data. So if you group a DataFrame and wish to invoke the std() + method on each group, you can simply do: + + :: + + df.groupby(mapper).std() + + rather than + + :: + + df.groupby(mapper).aggregate(np.std) + + You can pass arguments to these "wrapped" functions, too. + + See the online documentation for full exposition on these topics and much + more + + Returns + ------- + **Attributes** + groups : dict + {group name -> group labels} + len(grouped) : int + Number of groups + """ + + def __init__(self, obj, keys=None, axis=0, level=None, + grouper=None, exclusions=None, selection=None, as_index=True, + sort=True, group_keys=True): + self._selection = selection + + if isinstance(obj, NDFrame): + obj._consolidate_inplace() + + self.obj = obj + self.axis = axis + self.level = level + + if not as_index: + if not isinstance(obj, DataFrame): + raise TypeError('as_index=False only valid with DataFrame') + if axis != 0: + raise ValueError('as_index=False only valid for axis=0') + + self.as_index = as_index + self.keys = keys + self.sort = sort + self.group_keys = group_keys + + if grouper is None: + grouper, exclusions = _get_grouper(obj, keys, axis=axis, + level=level, sort=sort) + + self.grouper = grouper + self.exclusions = set(exclusions) if exclusions else set() + + def __len__(self): + return len(self.indices) + + @property + def groups(self): + return self.grouper.groups + + @property + def ngroups(self): + return self.grouper.ngroups + + @property + def indices(self): + return self.grouper.indices + + @property + def name(self): + if self._selection is None: + return None # 'result' + else: + return self._selection + + @property + def _selection_list(self): + if not isinstance(self._selection, (list, tuple, np.ndarray)): + return [self._selection] + return self._selection + + def __getattr__(self, attr): + if attr in self.obj: + return self[attr] + + if hasattr(self.obj, attr) and attr != '_cache': + return self._make_wrapper(attr) + + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, attr)) + + def __getitem__(self, key): + raise NotImplementedError + + def _make_wrapper(self, name): + f = getattr(self.obj, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) + + f = getattr(type(self.obj), name) + + def wrapper(*args, **kwargs): + # a little trickery for aggregation functions that need an axis + # argument + kwargs_with_axis = kwargs.copy() + if 'axis' not in 
kwargs_with_axis: + kwargs_with_axis['axis'] = self.axis + + def curried_with_axis(x): + return f(x, *args, **kwargs_with_axis) + def curried(x): + return f(x, *args, **kwargs) + + try: + return self.apply(curried_with_axis) + except Exception: + return self.apply(curried) + + return wrapper + + def get_group(self, name, obj=None): + if obj is None: + obj = self.obj + + inds = self.indices[name] + return obj.take(inds, axis=self.axis) + + def __iter__(self): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + return self.grouper.get_iterator(self.obj, axis=self.axis) + + def apply(self, func, *args, **kwargs): + """ + Apply function and combine results together in an intelligent way. The + split-apply-combine combination rules attempt to be as common sense + based as possible. For example: + + case 1: + group DataFrame + apply aggregation function (f(chunk) -> Series) + yield DataFrame, with group axis having group labels + + case 2: + group DataFrame + apply transform function ((f(chunk) -> DataFrame with same indexes) + yield DataFrame with resulting chunks glued together + + case 3: + group Series + apply function with f(chunk) -> DataFrame + yield DataFrame with result of chunks glued together + + Parameters + ---------- + func : function + + Notes + ----- + See online documentation for full exposition on how to use apply + + See also + -------- + aggregate, transform + + Returns + ------- + applied : type depending on grouped object and function + """ + return self._python_apply_general(func, *args, **kwargs) + + def aggregate(self, func, *args, **kwargs): + raise NotImplementedError + + def agg(self, func, *args, **kwargs): + """ + See docstring for aggregate + """ + return self.aggregate(func, *args, **kwargs) + + def _iterate_slices(self): + yield self.name, self.obj + + def transform(self, func, *args, **kwargs): + raise NotImplementedError + + def mean(self): + """ + Compute mean of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + try: + return self._cython_agg_general('mean') + except GroupByError: + raise + except Exception: # pragma: no cover + f = lambda x: x.mean(axis=self.axis) + return self._python_agg_general(f) + + def std(self, ddof=1): + """ + Compute standard deviation of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + # todo, implement at cython level? 
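To make the dispatch behaviour described in the GroupBy docstring concrete, a minimal sketch (toy frame; the column names are hypothetical):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                       'val': [1., 2., 3., 4.]})
    grouped = df.groupby('key')

    grouped.mean()        # per-group means of the numeric columns
    grouped['val'].std()  # sample (ddof=1) standard deviation per group

    for key, group in grouped:   # iteration yields (name, sub-frame) pairs
        pass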
+ if ddof == 1: + return self._cython_agg_general('std') + else: + f = lambda x: x.std(ddof=ddof) + return self._python_agg_general(f) + + def var(self, ddof=1): + """ + Compute variance of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + if ddof == 1: + return self._cython_agg_general('var') + else: + f = lambda x: x.var(ddof=ddof) + return self._python_agg_general(f) + + def size(self): + """ + Compute group sizes + """ + return self.grouper.size() + + sum = _groupby_function('sum', 'add', np.sum) + prod = _groupby_function('prod', 'prod', np.prod) + min = _groupby_function('min', 'min', np.min) + max = _groupby_function('max', 'max', np.max) + first = _groupby_function('first', 'first', _first_compat) + last = _groupby_function('last', 'last', _last_compat) + + def ohlc(self): + """ + Compute sum of values, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + return self._cython_agg_general('ohlc') + + def nth(self, n): + def picker(arr): + arr = arr[com.notnull(arr)] + if len(arr) >= n + 1: + return arr.iget(n) + else: + return np.nan + return self.agg(picker) + + def _cython_agg_general(self, how): + output = {} + for name, obj in self._iterate_slices(): + if not issubclass(obj.dtype.type, (np.number, np.bool_)): + continue + + result, names = self.grouper.aggregate(obj.values, how) + output[name] = result + + if len(output) == 0: + raise GroupByError('No numeric types to aggregate') + + return self._wrap_aggregated_output(output, names) + + def _python_agg_general(self, func, *args, **kwargs): + func = _intercept_function(func) + agg_func = lambda x: func(x, *args, **kwargs) + + # iterate through "columns" ex exclusions to populate output dict + output = {} + for name, obj in self._iterate_slices(): + try: + result, counts = self.grouper.agg_series(obj, agg_func) + output[name] = result + except TypeError: + continue + + if len(output) == 0: + return self._python_apply_general(func, *args, **kwargs) + + mask = counts.ravel() > 0 + for name, result in output.iteritems(): + output[name] = result[mask] + + return self._wrap_aggregated_output(output) + + def _python_apply_general(self, func, *args, **kwargs): + func = _intercept_function(func) + + result_keys = [] + result_values = [] + + not_indexed_same = False + for key, group in self: + object.__setattr__(group, 'name', key) + + # group might be modified + group_axes = _get_axes(group) + + res = func(group, *args, **kwargs) + + if not _is_indexed_like(res, group_axes): + not_indexed_same = True + + result_keys.append(key) + result_values.append(res) + + return self._wrap_applied_output(result_keys, result_values, + not_indexed_same=not_indexed_same) + + def _wrap_applied_output(self, *args, **kwargs): + raise NotImplementedError + + def _concat_objects(self, keys, values, not_indexed_same=False): + from pandas.tools.merge import concat + + if not not_indexed_same: + result = concat(values, axis=self.axis) + ax = self.obj._get_axis(self.axis) + + if isinstance(result, Series): + result = result.reindex(ax) + else: + result = result.reindex_axis(ax, axis=self.axis) + elif self.group_keys: + group_keys = keys + group_levels = self.grouper.levels + group_names = self.grouper.names + result = concat(values, axis=self.axis, keys=group_keys, + levels=group_levels, names=group_names) + else: + result = concat(values, axis=self.axis) + + return result + +def _generate_groups(obj, group_index, ngroups, axis=0): + if isinstance(obj, NDFrame) 
and not isinstance(obj, DataFrame): + factory = obj._constructor + obj = obj._data + else: + factory = None + + return generate_groups(obj, group_index, ngroups, + axis=axis, factory=factory) + +@Appender(GroupBy.__doc__) +def groupby(obj, by, **kwds): + if isinstance(obj, Series): + klass = SeriesGroupBy + elif isinstance(obj, DataFrame): + klass = DataFrameGroupBy + else: # pragma: no cover + raise TypeError('invalid type: %s' % type(obj)) + + return klass(obj, by, **kwds) + +def _get_axes(group): + if isinstance(group, Series): + return [group.index] + else: + return group.axes + +def _is_indexed_like(obj, axes): + if isinstance(obj, Series): + if len(axes) > 1: + return False + return obj.index.equals(axes[0]) + elif isinstance(obj, DataFrame): + return obj.index.equals(axes[0]) + + return False + +class Grouper(object): + """ + + """ + def __init__(self, axis, groupings, sort=True, group_keys=True): + self.axis = axis + self.groupings = groupings + self.sort = sort + self.group_keys = group_keys + + @property + def shape(self): + return tuple(ping.ngroups for ping in self.groupings) + + def __iter__(self): + return iter(self.indices) + + @property + def nkeys(self): + return len(self.groupings) + + def get_iterator(self, data, axis=0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + if len(self.groupings) == 1: + indices = self.indices + groups = indices.keys() + try: + groups = sorted(groups) + except Exception: # pragma: no cover + pass + + for name in groups: + inds = indices[name] + group = data.take(inds, axis=axis) + yield name, group + else: + # provide "flattened" iterator for multi-group setting + comp_ids, _, ngroups = self.group_info + label_list = self.labels + level_list = self.levels + mapper = _KeyMapper(comp_ids, ngroups, label_list, level_list) + + for label, group in _generate_groups(data, comp_ids, ngroups, + axis=axis): + key = mapper.get_key(label) + yield key, group + + @cache_readonly + def indices(self): + if len(self.groupings) == 1: + return self.groupings[0].indices + else: + label_list = [ping.labels for ping in self.groupings] + keys = [ping.group_index for ping in self.groupings] + return _get_indices_dict(label_list, keys) + + @property + def labels(self): + return [ping.labels for ping in self.groupings] + + @property + def levels(self): + return [ping.group_index for ping in self.groupings] + + @property + def names(self): + return [ping.name for ping in self.groupings] + + def size(self): + """ + Compute group sizes + """ + # TODO: better impl + labels, _, ngroups = self.group_info + bin_counts = Series(labels).value_counts() + bin_counts = bin_counts.reindex(np.arange(ngroups)) + bin_counts.index = self.result_index + return bin_counts + + @cache_readonly + def groups(self): + if len(self.groupings) == 1: + return self.groupings[0].groups + else: + to_groupby = zip(*(ping.grouper for ping in self.groupings)) + to_groupby = Index(to_groupby) + + return self.axis.groupby(to_groupby) + + @cache_readonly + def group_info(self): + comp_ids, obs_group_ids = self._get_compressed_labels() + + ngroups = len(obs_group_ids) + comp_ids = com._ensure_int64(comp_ids) + return comp_ids, obs_group_ids, ngroups + + def _get_compressed_labels(self): + all_labels = [ping.labels for ping in self.groupings] + if self._overflow_possible: + tups = lib.fast_zip(all_labels) + labs, uniques, _ = algos.factorize(tups) + + if self.sort: + uniques, labs = _reorder_by_uniques(uniques, labs) + + 
return labs, uniques + else: + if len(all_labels) > 1: + group_index = get_group_index(all_labels, self.shape) + else: + group_index = all_labels[0] + comp_ids, obs_group_ids = _compress_group_index(group_index) + return comp_ids, obs_group_ids + + @cache_readonly + def _overflow_possible(self): + return _int64_overflow_possible(self.shape) + + @cache_readonly + def ngroups(self): + return len(self.result_index) + + @cache_readonly + def result_index(self): + recons = self.get_group_levels() + return MultiIndex.from_arrays(recons, names=self.names) + + def get_group_levels(self): + obs_ids = self.group_info[1] + if self._overflow_possible: + recons_labels = [np.array(x) for x in izip(*obs_ids)] + else: + recons_labels = decons_group_index(obs_ids, self.shape) + + name_list = [] + for ping, labels in zip(self.groupings, recons_labels): + labels = com._ensure_platform_int(labels) + name_list.append(ping.group_index.take(labels)) + + return name_list + + #------------------------------------------------------------ + # Aggregation functions + + _cython_functions = { + 'add' : lib.group_add, + 'prod' : lib.group_prod, + 'min' : lib.group_min, + 'max' : lib.group_max, + 'mean' : lib.group_mean, + 'var' : lib.group_var, + 'std' : lib.group_var, + 'first': lambda a, b, c, d: lib.group_nth(a, b, c, d, 1), + 'last': lib.group_last + } + + _cython_transforms = { + 'std' : np.sqrt + } + + _cython_arity = { + 'ohlc' : 4, # OHLC + } + + _name_functions = {} + + _filter_empty_groups = True + + def aggregate(self, values, how, axis=0): + values = com._ensure_float64(values) + arity = self._cython_arity.get(how, 1) + + vdim = values.ndim + swapped = False + if vdim == 1: + values = values[:, None] + out_shape = (self.ngroups, arity) + else: + if axis > 0: + swapped = True + values = values.swapaxes(0, axis) + if arity > 1: + raise NotImplementedError + out_shape = (self.ngroups,) + values.shape[1:] + + # will be filled in Cython function + result = np.empty(out_shape, dtype=np.float64) + counts = np.zeros(self.ngroups, dtype=np.int64) + + result = self._aggregate(result, counts, values, how) + + if self._filter_empty_groups: + if result.ndim == 2: + result = lib.row_bool_subset(result, + (counts > 0).view(np.uint8)) + else: + result = result[counts > 0] + + if vdim == 1 and arity == 1: + result = result[:, 0] + + if how in self._name_functions: + # TODO + names = self._name_functions[how]() + else: + names = None + + if swapped: + result = result.swapaxes(0, axis) + + return result, names + + def _aggregate(self, result, counts, values, how): + agg_func = self._cython_functions[how] + trans_func = self._cython_transforms.get(how, lambda x: x) + + comp_ids, _, ngroups = self.group_info + if values.ndim > 3: + # punting for now + raise NotImplementedError + elif values.ndim > 2: + for i, chunk in enumerate(values.transpose(2, 0, 1)): + agg_func(result[:, :, i], counts, chunk.squeeze(), + comp_ids) + else: + agg_func(result, counts, values, comp_ids) + + return trans_func(result) + + def agg_series(self, obj, func): + try: + return self._aggregate_series_fast(obj, func) + except Exception: + return self._aggregate_series_pure_python(obj, func) + + def _aggregate_series_fast(self, obj, func): + func = _intercept_function(func) + + if obj.index._has_complex_internals: + raise TypeError('Incompatible index for Cython grouper') + + group_index, _, ngroups = self.group_info + + # avoids object / Series creation overhead + dummy = obj[:0].copy() + indexer = lib.groupsort_indexer(group_index, ngroups)[0] + obj = 
obj.take(indexer) + group_index = com.ndtake(group_index, indexer) + grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, + dummy) + result, counts = grouper.get_result() + return result, counts + + def _aggregate_series_pure_python(self, obj, func): + group_index, _, ngroups = self.group_info + + counts = np.zeros(ngroups, dtype=int) + result = None + + group_index, _, ngroups = self.group_info + + for label, group in _generate_groups(obj, group_index, ngroups, + axis=self.axis): + res = func(group) + if result is None: + try: + assert(not isinstance(res, np.ndarray)) + assert(not isinstance(res, list)) + result = np.empty(ngroups, dtype='O') + except Exception: + raise ValueError('function does not reduce') + + counts[label] = group.shape[0] + result[label] = res + + result = lib.maybe_convert_objects(result, try_float=0) + return result, counts + + +def generate_bins_generic(values, binner, closed): + """ + Generate bin edge offsets and bin labels for one array using another array + which has bin edge values. Both arrays must be sorted. + + Parameters + ---------- + values : array of values + binner : a comparable array of values representing bins into which to bin + the first array. Note, 'values' end-points must fall within 'binner' + end-points. + closed : which end of bin is closed; left (default), right + + Returns + ------- + bins : array of offsets (into 'values' argument) of bins. + Zero and last edge are excluded in result, so for instance the first + bin is values[0:bin[0]] and the last is values[bin[-1]:] + """ + lenidx = len(values) + lenbin = len(binner) + + if lenidx <= 0 or lenbin <= 0: + raise ValueError("Invalid length for values or for binner") + + # check binner fits data + if values[0] < binner[0]: + raise ValueError("Values falls before first bin") + + if values[lenidx-1] > binner[lenbin-1]: + raise ValueError("Values falls after last bin") + + bins = np.empty(lenbin - 1, dtype=np.int64) + + j = 0 # index into values + bc = 0 # bin count + + # linear scan, presume nothing about values/binner except that it + # fits ok + for i in range(0, lenbin-1): + r_bin = binner[i+1] + + # count values in current bin, advance to next bin + while j < lenidx and (values[j] < r_bin or + (closed == 'right' and values[j] == r_bin)): + j += 1 + + bins[bc] = j + bc += 1 + + return bins + + +class CustomGrouper(object): + + def get_grouper(self, obj): + raise NotImplementedError + + +class BinGrouper(Grouper): + + def __init__(self, bins, binlabels, filter_empty=False): + self.bins = com._ensure_int64(bins) + self.binlabels = _ensure_index(binlabels) + self._filter_empty_groups = filter_empty + + @property + def nkeys(self): + return 1 + + def get_iterator(self, data, axis=0): + """ + Groupby iterator + + Returns + ------- + Generator yielding sequence of (name, subsetted object) + for each group + """ + if axis == 1: + raise NotImplementedError + + start = 0 + for edge, label in zip(self.bins, self.binlabels): + yield label, data[start:edge] + start = edge + + if edge < len(data): + yield self.binlabels[-1], data[edge:] + + @cache_readonly + def ngroups(self): + return len(self.binlabels) + + @cache_readonly + def result_index(self): + return self.binlabels + + @property + def levels(self): + return [self.binlabels] + + @property + def names(self): + return [self.binlabels.name] + + #---------------------------------------------------------------------- + # cython aggregation + + _cython_functions = { + 'add' : lib.group_add_bin, + 'prod' : lib.group_prod_bin, + 'mean' : 
lib.group_mean_bin, + 'min' : lib.group_min_bin, + 'max' : lib.group_max_bin, + 'var' : lib.group_var_bin, + 'std' : lib.group_var_bin, + 'ohlc' : lib.group_ohlc, + 'first': lambda a, b, c, d: lib.group_nth_bin(a, b, c, d, 1), + 'last': lib.group_last_bin + } + + _name_functions = { + 'ohlc' : lambda *args: ['open', 'high', 'low', 'close'] + } + + _filter_empty_groups = True + + def _aggregate(self, result, counts, values, how): + agg_func = self._cython_functions[how] + trans_func = self._cython_transforms.get(how, lambda x: x) + + if values.ndim > 3: + # punting for now + raise NotImplementedError + elif values.ndim > 2: + for i, chunk in enumerate(values.transpose(2, 0, 1)): + agg_func(result[:, :, i], counts, chunk, self.bins) + else: + agg_func(result, counts, values, self.bins) + + return trans_func(result) + + def agg_series(self, obj, func): + dummy = obj[:0] + grouper = lib.SeriesBinGrouper(obj, func, self.bins, dummy) + return grouper.get_result() + + +class Grouping(object): + """ + Holds the grouping information for a single key + + Parameters + ---------- + index : Index + grouper : + name : + level : + + Returns + ------- + **Attributes**: + * indices : dict of {group -> index_list} + * labels : ndarray, group labels + * ids : mapping of label -> group + * counts : array of group counts + * group_index : unique groups + * groups : dict of {group -> label_list} + """ + def __init__(self, index, grouper=None, name=None, level=None, + sort=True): + + self.name = name + self.level = level + self.grouper = _convert_grouper(index, grouper) + self.index = index + self.sort = sort + + # right place for this? + if isinstance(grouper, (Series, Index)) and name is None: + self.name = grouper.name + + # pre-computed + self._was_factor = False + + if level is not None: + if not isinstance(level, int): + assert(level in index.names) + level = index.names.index(level) + + inds = index.labels[level] + level_index = index.levels[level] + + if self.name is None: + self.name = index.names[level] + + # XXX complete hack + + level_values = index.levels[level].take(inds) + if grouper is not None: + self.grouper = level_values.map(self.grouper) + else: + self._was_factor = True + self._labels = inds + self._group_index = level_index + self.grouper = level_values + else: + if isinstance(self.grouper, (list, tuple)): + self.grouper = com._asarray_tuplesafe(self.grouper) + elif isinstance(self.grouper, Factor): + factor = self.grouper + self._was_factor = True + + # Is there any way to avoid this? 
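For context, the level-based grouping handled by this constructor can be sketched as (hypothetical two-level index):

    import numpy as np
    import pandas as pd

    index = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
    s = pd.Series(np.arange(4.), index=index)

    s.groupby(level=0).sum()   # one group per outer-level label: 'a' and 'b'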
+ self.grouper = np.asarray(factor) + + self._labels = factor.labels + self._group_index = factor.levels + if self.name is None: + self.name = factor.name + + # no level passed + if not isinstance(self.grouper, np.ndarray): + self.grouper = self.index.map(self.grouper) + + def __repr__(self): + return 'Grouping(%s)' % self.name + + def __iter__(self): + return iter(self.indices) + + _labels = None + _counts = None + _group_index = None + + @property + def ngroups(self): + return len(self.group_index) + + @cache_readonly + def indices(self): + return _groupby_indices(self.grouper) + + @property + def labels(self): + if self._labels is None: + self._make_labels() + return self._labels + + @property + def counts(self): + if self._counts is None: + if self._was_factor: + self._counts = lib.group_count(com._ensure_int64(self.labels), + self.ngroups) + else: + self._make_labels() + return self._counts + + @property + def group_index(self): + if self._group_index is None: + self._make_labels() + return self._group_index + + def _make_labels(self): + if self._was_factor: # pragma: no cover + raise Exception('Should not call this method grouping by level') + else: + labs, uniques, counts = algos.factorize(self.grouper, + sort=self.sort) + uniques = Index(uniques, name=self.name) + self._labels = labs + self._group_index = uniques + self._counts = counts + + _groups = None + @property + def groups(self): + if self._groups is None: + self._groups = self.index.groupby(self.grouper) + return self._groups + + +def _get_grouper(obj, key=None, axis=0, level=None, sort=True): + group_axis = obj._get_axis(axis) + + if level is not None: + if not isinstance(group_axis, MultiIndex): + if level > 0: + raise ValueError('level > 0 only valid with MultiIndex') + else: + level = None + key = group_axis + + if isinstance(key, CustomGrouper): + gpr = key.get_grouper(obj) + return gpr, [] + elif isinstance(key, Grouper): + return key, [] + + if not isinstance(key, (tuple, list)): + keys = [key] + else: + keys = key + + # what are we after, exactly? 
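The kinds of group keys being reconciled here can be sketched as (toy frame; the names are hypothetical):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'b', 'a', 'b'], 'val': np.arange(4.)})

    df.groupby('key')['val'].sum()              # key is a column name
    df['val'].groupby(df['key'].values).sum()   # key is an array of labels
    df['val'].groupby(lambda i: i % 2).sum()    # key is a function of the axis labels
    df['val'].groupby({0: 'x', 1: 'y', 2: 'x', 3: 'y'}).sum()   # key is a mapping of labels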
+ match_axis_length = len(keys) == len(group_axis) + any_callable = any(callable(g) or isinstance(g, dict) for g in keys) + any_arraylike = any(isinstance(g, (list, tuple, np.ndarray)) + for g in keys) + + try: + if isinstance(obj, DataFrame): + all_in_columns = all(g in obj.columns for g in keys) + else: + all_in_columns = False + except Exception: + all_in_columns = False + + if (not any_callable and not all_in_columns + and not any_arraylike and match_axis_length + and not level): + keys = [com._asarray_tuplesafe(keys)] + + if isinstance(level, (tuple, list)): + if key is None: + keys = [None] * len(level) + levels = level + else: + levels = [level] * len(keys) + + groupings = [] + exclusions = [] + for i, (gpr, level) in enumerate(zip(keys, levels)): + name = None + try: + obj._data.items.get_loc(gpr) + in_axis = True + except Exception: + in_axis = False + + if _is_label_like(gpr) or in_axis: + exclusions.append(gpr) + name = gpr + gpr = obj[gpr] + ping = Grouping(group_axis, gpr, name=name, level=level, sort=sort) + groupings.append(ping) + + if len(groupings) == 0: + raise ValueError('No group keys passed!') + + grouper = Grouper(group_axis, groupings, sort=sort) + + return grouper, exclusions + +def _is_label_like(val): + return isinstance(val, basestring) or np.isscalar(val) + +def _convert_grouper(axis, grouper): + if isinstance(grouper, dict): + return grouper.get + elif isinstance(grouper, Series): + if grouper.index.equals(axis): + return grouper.values + else: + return grouper.reindex(axis).values + elif isinstance(grouper, (list, np.ndarray)): + assert(len(grouper) == len(axis)) + return grouper + else: + return grouper + +class SeriesGroupBy(GroupBy): + + def aggregate(self, func_or_funcs, *args, **kwargs): + """ + Apply aggregation function or functions to groups, yielding most likely + Series but in some cases DataFrame depending on the output of the + aggregation function + + Parameters + ---------- + func_or_funcs : function or list / dict of functions + List/dict of functions will produce DataFrame with column names + determined by the function names themselves (list) or the keys in + the dict + + Notes + ----- + agg is an alias for aggregate. Use it. + + Example + ------- + >>> series + bar 1.0 + baz 2.0 + qot 3.0 + qux 4.0 + + >>> mapper = lambda x: x[0] # first letter + >>> grouped = series.groupby(mapper) + + >>> grouped.aggregate(np.sum) + b 3.0 + q 7.0 + + >>> grouped.aggregate([np.sum, np.mean, np.std]) + mean std sum + b 1.5 0.5 3 + q 3.5 0.5 7 + + >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), + ... 
'total' : np.sum}) + result total + b 2.121 3 + q 4.95 7 + + See also + -------- + apply, transform + + Returns + ------- + Series or DataFrame + """ + if isinstance(func_or_funcs, basestring): + return getattr(self, func_or_funcs)(*args, **kwargs) + + if hasattr(func_or_funcs,'__iter__'): + ret = self._aggregate_multiple_funcs(func_or_funcs) + else: + cyfunc = _intercept_cython(func_or_funcs) + if cyfunc and not args and not kwargs: + return getattr(self, cyfunc)() + + if self.grouper.nkeys > 1: + return self._python_agg_general(func_or_funcs, *args, **kwargs) + + try: + return self._python_agg_general(func_or_funcs, *args, **kwargs) + except Exception: + result = self._aggregate_named(func_or_funcs, *args, **kwargs) + + index = Index(sorted(result), name=self.grouper.names[0]) + ret = Series(result, index=index) + + if not self.as_index: # pragma: no cover + print 'Warning, ignoring as_index=True' + + return ret + + def _aggregate_multiple_funcs(self, arg): + if isinstance(arg, dict): + columns = arg.keys() + arg = arg.items() + elif any(isinstance(x, (tuple, list)) for x in arg): + arg = [(x, x) if not isinstance(x, (tuple, list)) else x + for x in arg] + + # indicated column order + columns = list(zip(*arg))[0] + else: + # list of functions / function names + columns = [] + for f in arg: + if isinstance(f, basestring): + columns.append(f) + else: + columns.append(f.__name__) + arg = zip(columns, arg) + + results = {} + + for name, func in arg: + results[name] = self.aggregate(func) + + return DataFrame(results, columns=columns) + + def _wrap_aggregated_output(self, output, names=None): + # sort of a kludge + output = output[self.name] + index = self.grouper.result_index + + if names is not None: + return DataFrame(output, index=index, columns=names) + else: + return Series(output, index=index, name=self.name) + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + if len(keys) == 0: + return Series([]) + + def _get_index(): + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(keys, names=self.grouper.names) + else: + index = Index(keys, name=self.grouper.names[0]) + return index + + if isinstance(values[0], dict): + # # GH #823 + return DataFrame(values, index=keys).stack() + + if isinstance(values[0], (Series, dict)): + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + elif isinstance(values[0], DataFrame): + # possible that Series -> DataFrame by applied function + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + else: + return Series(values, index=_get_index()) + + def _aggregate_named(self, func, *args, **kwargs): + result = {} + + for name, group in self: + group.name = name + output = func(group, *args, **kwargs) + if isinstance(output, np.ndarray): + raise Exception('Must produce aggregated value') + result[name] = output + + return result + + def transform(self, func, *args, **kwargs): + """ + Call function producing a like-indexed Series on each group and return + a Series with the transformed values + + Parameters + ---------- + func : function + To apply to each group. 
Should return a Series with the same index + + Example + ------- + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + + Returns + ------- + transformed : Series + """ + result = self.obj.copy() + + if isinstance(func, basestring): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + for name, group in self: + object.__setattr__(group, 'name', name) + res = wrapper(group) + # result[group.index] = res + indexer = self.obj.index.get_indexer(group.index) + np.put(result, indexer, res) + + return result + +class NDFrameGroupBy(GroupBy): + + def _iterate_slices(self): + if self.axis == 0: + # kludge + if self._selection is None: + slice_axis = self.obj.columns + else: + slice_axis = self._selection_list + slicer = lambda x: self.obj[x] + else: + slice_axis = self.obj.index + slicer = self.obj.xs + + for val in slice_axis: + if val in self.exclusions: + continue + + yield val, slicer(val) + + def _cython_agg_general(self, how): + new_blocks = self._cython_agg_blocks(how) + return self._wrap_agged_blocks(new_blocks) + + def _wrap_agged_blocks(self, blocks): + obj = self._obj_with_exclusions + + new_axes = list(obj._data.axes) + + # more kludge + if self.axis == 0: + new_axes[0], new_axes[1] = new_axes[1], self.grouper.result_index + else: + new_axes[self.axis] = self.grouper.result_index + + mgr = BlockManager(blocks, new_axes) + + new_obj = type(obj)(mgr) + + return self._post_process_cython_aggregate(new_obj) + + _block_agg_axis = 0 + + def _cython_agg_blocks(self, how): + data, agg_axis = self._get_data_to_aggregate() + + new_blocks = [] + + for block in data.blocks: + values = block.values + if not issubclass(values.dtype.type, (np.number, np.bool_)): + continue + + values = com._ensure_float64(values) + result, names = self.grouper.aggregate(values, how, axis=agg_axis) + newb = make_block(result, block.items, block.ref_items) + new_blocks.append(newb) + + if len(new_blocks) == 0: + raise GroupByError('No numeric types to aggregate') + + return new_blocks + + def _get_data_to_aggregate(self): + obj = self._obj_with_exclusions + if self.axis == 0: + return obj.swapaxes(0, 1)._data, 1 + else: + return obj._data, self.axis + + def _post_process_cython_aggregate(self, obj): + # undoing kludge from below + if self.axis == 0: + obj = obj.swapaxes(0, 1) + return obj + + @cache_readonly + def _obj_with_exclusions(self): + if self._selection is not None: + return self.obj.reindex(columns=self._selection_list) + + if len(self.exclusions) > 0: + return self.obj.drop(self.exclusions, axis=1) + else: + return self.obj + + def aggregate(self, arg, *args, **kwargs): + """ + Aggregate using input function or dict of {column -> function} + + Parameters + ---------- + arg : function or dict + Function to use for aggregating groups. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. 
If + pass a dict, the keys must be DataFrame column names + + Returns + ------- + aggregated : DataFrame + """ + if isinstance(arg, basestring): + return getattr(self, arg)(*args, **kwargs) + + result = {} + if isinstance(arg, dict): + if self.axis != 0: # pragma: no cover + raise ValueError('Can only pass dict with axis=0') + + obj = self._obj_with_exclusions + + if any(isinstance(x, (list, tuple, dict)) for x in arg.values()): + new_arg = {} + for k, v in arg.iteritems(): + if not isinstance(v, (tuple, list, dict)): + new_arg[k] = [v] + else: + new_arg[k] = v + arg = new_arg + + keys = [] + if self._selection is not None: + subset = obj[self._selection] + if isinstance(subset, DataFrame): + raise NotImplementedError + + for fname, agg_how in arg.iteritems(): + colg = SeriesGroupBy(subset, selection=self._selection, + grouper=self.grouper) + result[fname] = colg.aggregate(agg_how) + keys.append(fname) + else: + for col, agg_how in arg.iteritems(): + colg = SeriesGroupBy(obj[col], selection=col, + grouper=self.grouper) + result[col] = colg.aggregate(agg_how) + keys.append(col) + + if isinstance(result.values()[0], DataFrame): + from pandas.tools.merge import concat + result = concat([result[k] for k in keys], keys=keys, axis=1) + else: + result = DataFrame(result) + elif isinstance(arg, list): + return self._aggregate_multiple_funcs(arg) + else: + cyfunc = _intercept_cython(arg) + if cyfunc and not args and not kwargs: + return getattr(self, cyfunc)() + + if self.grouper.nkeys > 1: + return self._python_agg_general(arg, *args, **kwargs) + else: + result = self._aggregate_generic(arg, *args, **kwargs) + + if not self.as_index: + if isinstance(result.index, MultiIndex): + zipped = zip(result.index.levels, result.index.labels, + result.index.names) + for i, (lev, lab, name) in enumerate(zipped): + result.insert(i, name, com.ndtake(lev.values, lab)) + result = result.consolidate() + else: + values = result.index.values + name = self.grouper.groupings[0].name + result.insert(0, name, values) + result.index = np.arange(len(result)) + + return result + + def _aggregate_multiple_funcs(self, arg): + from pandas.tools.merge import concat + + if self.axis != 0: + raise NotImplementedError + + obj = self._obj_with_exclusions + + results = [] + keys = [] + for col in obj: + try: + colg = SeriesGroupBy(obj[col], selection=col, + grouper=self.grouper) + results.append(colg.aggregate(arg)) + keys.append(col) + except (TypeError, GroupByError): + pass + + result = concat(results, keys=keys, axis=1) + + return result + + def _aggregate_generic(self, func, *args, **kwargs): + assert(self.grouper.nkeys == 1) + + axis = self.axis + obj = self._obj_with_exclusions + + result = {} + if axis != obj._het_axis: + try: + for name in self.indices: + data = self.get_group(name, obj=obj) + result[name] = func(data, *args, **kwargs) + except Exception: + return self._aggregate_item_by_item(func, *args, **kwargs) + else: + for name in self.indices: + try: + data = self.get_group(name, obj=obj) + result[name] = func(data, *args, **kwargs) + except Exception: + wrapper = lambda x: func(x, *args, **kwargs) + result[name] = data.apply(wrapper, axis=axis) + + return self._wrap_generic_output(result, obj) + + def _wrap_aggregated_output(self, output, names=None): + raise NotImplementedError + + def _aggregate_item_by_item(self, func, *args, **kwargs): + # only for axis==0 + + obj = self._obj_with_exclusions + result = {} + cannot_agg = [] + for item in obj: + try: + colg = SeriesGroupBy(obj[item], selection=item, + 
grouper=self.grouper) + result[item] = colg.aggregate(func, *args, **kwargs) + except (ValueError, TypeError): + cannot_agg.append(item) + continue + + result_columns = obj.columns + if cannot_agg: + result_columns = result_columns.drop(cannot_agg) + + return DataFrame(result, columns=result_columns) + + def _decide_output_index(self, output, labels): + if len(output) == len(labels): + output_keys = labels + else: + output_keys = sorted(output) + try: + output_keys.sort() + except Exception: # pragma: no cover + pass + + if isinstance(labels, MultiIndex): + output_keys = MultiIndex.from_tuples(output_keys, + names=labels.names) + + return output_keys + + def _wrap_applied_output(self, keys, values, not_indexed_same=False): + from pandas.core.index import _all_indexes_same + + if len(keys) == 0: + # XXX + return DataFrame({}) + + key_names = self.grouper.names + + if isinstance(values[0], DataFrame): + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + else: + if len(self.grouper.groupings) > 1: + key_index = MultiIndex.from_tuples(keys, names=key_names) + else: + ping = self.grouper.groupings[0] + if len(keys) == ping.ngroups: + key_index = ping.group_index + key_index.name = key_names[0] + + key_lookup = Index(keys) + indexer = key_lookup.get_indexer(key_index) + + # reorder the values + values = [values[i] for i in indexer] + else: + key_index = Index(keys, name=key_names[0]) + + if isinstance(values[0], np.ndarray): + if (isinstance(values[0], Series) and + not _all_indexes_same([x.index for x in values])): + return self._concat_objects(keys, values, + not_indexed_same=not_indexed_same) + + if self.axis == 0: + stacked_values = np.vstack([np.asarray(x) + for x in values]) + columns = values[0].index + index = key_index + else: + stacked_values = np.vstack([np.asarray(x) + for x in values]).T + index = values[0].index + columns = key_index + return DataFrame(stacked_values, index=index, + columns=columns) + else: + return Series(values, index=key_index) + + def transform(self, func, *args, **kwargs): + """ + Call function producing a like-indexed DataFrame on each group and + return a DataFrame having the same indexes as the original object + filled with the transformed values + + Parameters + ---------- + f : function + Function to apply to each subframe + + Note + ---- + Each subframe is endowed the attribute 'name' in case you need to know + which group you are working on. 
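One concrete reading of the note above: because each chunk carries its group key as `name`, the passed function can look up per-group information, for example (hypothetical frame and constants, using a single selected column):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'], 'val': [1., 2., 3., 4.]})
    offsets = {'a': 10., 'b': 100.}   # per-group constants keyed by group name

    shifted = df.groupby('key')['val'].transform(lambda x: x + offsets[x.name])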
+ + Example + -------- + >>> grouped = df.groupby(lambda x: mapping[x]) + >>> grouped.transform(lambda x: (x - x.mean()) / x.std()) + """ + from pandas.tools.merge import concat + + applied = [] + + obj = self._obj_with_exclusions + gen = self.grouper.get_iterator(obj, axis=self.axis) + + if isinstance(func, basestring): + wrapper = lambda x: getattr(x, func)(*args, **kwargs) + else: + wrapper = lambda x: func(x, *args, **kwargs) + + for name, group in gen: + object.__setattr__(group, 'name', name) + + try: + res = group.apply(wrapper, axis=self.axis) + except TypeError: + return self._transform_item_by_item(obj, wrapper) + except Exception: # pragma: no cover + res = wrapper(group) + + # broadcasting + if isinstance(res, Series): + if res.index is obj.index: + group.T.values[:] = res + else: + group.values[:] = res + + applied.append(group) + else: + applied.append(res) + + concat_index = obj.columns if self.axis == 0 else obj.index + concatenated = concat(applied, join_axes=[concat_index], + axis=self.axis, verify_integrity=False) + return concatenated.reindex_like(obj) + + def _transform_item_by_item(self, obj, wrapper): + # iterate through columns + output = {} + inds = [] + for i, col in enumerate(obj): + try: + output[col] = self[col].transform(wrapper) + inds.append(i) + except Exception: + pass + + if len(output) == 0: # pragma: no cover + raise TypeError('Transform function invalid for data types') + + columns = obj.columns + if len(output) < len(obj.columns): + columns = columns.take(inds) + + return DataFrame(output, index=obj.index, columns=columns) + + +class DataFrameGroupBy(NDFrameGroupBy): + + _block_agg_axis = 1 + + def __getitem__(self, key): + if self._selection is not None: + raise Exception('Column(s) %s already selected' % self._selection) + + if isinstance(key, (list, tuple, np.ndarray)) or not self.as_index: + return DataFrameGroupBy(self.obj, self.grouper, selection=key, + grouper=self.grouper, + exclusions=self.exclusions, + as_index=self.as_index) + else: + if key not in self.obj: # pragma: no cover + raise KeyError(str(key)) + # kind of a kludge + return SeriesGroupBy(self.obj[key], selection=key, + grouper=self.grouper, + exclusions=self.exclusions) + + def _wrap_generic_output(self, result, obj): + result_index = self.grouper.levels[0] + + if result: + if self.axis == 0: + result = DataFrame(result, index=obj.columns, + columns=result_index).T + else: + result = DataFrame(result, index=obj.index, + columns=result_index) + else: + result = DataFrame(result) + + return result + + def _get_data_to_aggregate(self): + obj = self._obj_with_exclusions + if self.axis == 1: + return obj.T._data, 1 + else: + return obj._data, 1 + + def _wrap_aggregated_output(self, output, names=None): + agg_axis = 0 if self.axis == 1 else 1 + agg_labels = self._obj_with_exclusions._get_axis(agg_axis) + + output_keys = self._decide_output_index(output, agg_labels) + + if not self.as_index: + result = DataFrame(output, columns=output_keys) + group_levels = self.grouper.get_group_levels() + zipped = zip(self.grouper.names, group_levels) + + for i, (name, labels) in enumerate(zipped): + result.insert(i, name, labels) + result = result.consolidate() + else: + index = self.grouper.result_index + result = DataFrame(output, index=index, columns=output_keys) + + if self.axis == 1: + result = result.T + + return result + + def _wrap_agged_blocks(self, blocks): + obj = self._obj_with_exclusions + + if self.axis == 0: + agg_labels = obj.columns + else: + agg_labels = obj.index + + if 
sum(len(x.items) for x in blocks) == len(agg_labels): + output_keys = agg_labels + else: + all_items = [] + for b in blocks: + all_items.extend(b.items) + output_keys = agg_labels[agg_labels.isin(all_items)] + + for blk in blocks: + blk.set_ref_items(output_keys, maybe_rename=False) + + if not self.as_index: + index = np.arange(blocks[0].values.shape[1]) + mgr = BlockManager(blocks, [output_keys, index]) + result = DataFrame(mgr) + + group_levels = self.grouper.get_group_levels() + zipped = zip(self.grouper.names, group_levels) + + for i, (name, labels) in enumerate(zipped): + result.insert(i, name, labels) + result = result.consolidate() + else: + index = self.grouper.result_index + mgr = BlockManager(blocks, [output_keys, index]) + result = DataFrame(mgr) + + if self.axis == 1: + result = result.T + + return result + +from pandas.tools.plotting import boxplot_frame_groupby +DataFrameGroupBy.boxplot = boxplot_frame_groupby + +class PanelGroupBy(NDFrameGroupBy): + + def _iterate_slices(self): + if self.axis == 0: + # kludge + if self._selection is None: + slice_axis = self.obj.items + else: + slice_axis = self._selection_list + slicer = lambda x: self.obj[x] + else: + raise NotImplementedError + + for val in slice_axis: + if val in self.exclusions: + continue + + yield val, slicer(val) + + def aggregate(self, arg, *args, **kwargs): + """ + Aggregate using input function or dict of {column -> function} + + Parameters + ---------- + arg : function or dict + Function to use for aggregating groups. If a function, must either + work when passed a Panel or when passed to Panel.apply. If + pass a dict, the keys must be DataFrame column names + + Returns + ------- + aggregated : Panel + """ + if isinstance(arg, basestring): + return getattr(self, arg)(*args, **kwargs) + + return self._aggregate_generic(arg, *args, **kwargs) + + def _wrap_generic_output(self, result, obj): + new_axes = list(obj.axes) + new_axes[self.axis] = self.grouper.result_index + + result = Panel._from_axes(result, new_axes) + + if self.axis > 0: + result = result.swapaxes(0, self.axis) + + return result + + def _aggregate_item_by_item(self, func, *args, **kwargs): + obj = self._obj_with_exclusions + result = {} + cannot_agg = [] + + if self.axis > 0: + for item in obj: + try: + itemg = DataFrameGroupBy(obj[item], + axis=self.axis - 1, + grouper=self.grouper) + result[item] = itemg.aggregate(func, *args, **kwargs) + except (ValueError, TypeError): + raise + new_axes = list(obj.axes) + new_axes[self.axis] = self.grouper.result_index + return Panel._from_axes(result, new_axes) + else: + raise NotImplementedError + + def _wrap_aggregated_output(self, output, names=None): + raise NotImplementedError + + +class NDArrayGroupBy(GroupBy): + pass + + +#---------------------------------------------------------------------- +# Grouping generator for BlockManager + +def generate_groups(data, group_index, ngroups, axis=0, factory=lambda x: x): + """ + Parameters + ---------- + data : BlockManager + + Returns + ------- + generator + """ + group_index = com._ensure_int64(group_index) + + indexer = lib.groupsort_indexer(group_index, ngroups)[0] + group_index = com.ndtake(group_index, indexer) + + if isinstance(data, BlockManager): + # this is sort of wasteful but... 
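The sort-then-slice strategy used here can be sketched in plain NumPy (conceptual only; lib.groupsort_indexer is the Cython routine actually used):

    import numpy as np

    group_index = np.array([1, 0, 2, 0, 1])
    ngroups = 3

    # a stable sort brings the members of each group together
    indexer = np.argsort(group_index, kind='mergesort')
    sorted_ids = group_index[indexer]

    # each group is then a contiguous slice of the sorted data
    starts = np.searchsorted(sorted_ids, np.arange(ngroups), side='left')
    ends = np.searchsorted(sorted_ids, np.arange(ngroups), side='right')
    # group 0 occupies sorted positions starts[0]:ends[0], and so on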
+ sorted_axis = data.axes[axis].take(indexer) + sorted_data = data.reindex_axis(sorted_axis, axis=axis) + if isinstance(data, Series): + sorted_axis = data.index.take(indexer) + sorted_data = data.reindex(sorted_axis) + elif isinstance(data, DataFrame): + sorted_data = data.take(indexer, axis=axis) + + if isinstance(sorted_data, DataFrame): + def _get_slice(slob): + if axis == 0: + return sorted_data[slob] + else: + return sorted_data.ix[:, slob] + elif isinstance(sorted_data, BlockManager): + def _get_slice(slob): + return factory(sorted_data.get_slice(slob, axis=axis)) + elif isinstance(sorted_data, Series): + def _get_slice(slob): + return sorted_data._get_values(slob) + else: # pragma: no cover + def _get_slice(slob): + return sorted_data[slob] + + starts, ends = lib.generate_slices(group_index, ngroups) + + for i, (start, end) in enumerate(zip(starts, ends)): + # Since I'm now compressing the group ids, it's now not "possible" to + # produce empty slices because such groups would not be observed in the + # data + assert(start < end) + yield i, _get_slice(slice(start, end)) + +def get_group_index(label_list, shape): + """ + For the particular label_list, gets the offsets into the hypothetical list + representing the totally ordered cartesian product of all possible label + combinations. + """ + if len(label_list) == 1: + return label_list[0] + + n = len(label_list[0]) + group_index = np.zeros(n, dtype=np.int64) + mask = np.zeros(n, dtype=bool) + for i in xrange(len(shape)): + stride = np.prod([x for x in shape[i+1:]], dtype=np.int64) + group_index += com._ensure_int64(label_list[i]) * stride + mask |= label_list[i] < 0 + + np.putmask(group_index, mask, -1) + return group_index + +_INT64_MAX = np.iinfo(np.int64).max +def _int64_overflow_possible(shape): + the_prod = 1L + for x in shape: + the_prod *= long(x) + + return the_prod >= _INT64_MAX + +def decons_group_index(comp_labels, shape): + # reconstruct labels + label_list = [] + factor = 1 + y = 0 + x = comp_labels + for i in reversed(xrange(len(shape))): + labels = (x - y) % (factor * shape[i]) // factor + np.putmask(labels, comp_labels < 0, -1) + label_list.append(labels) + y = labels * factor + factor *= shape[i] + return label_list[::-1] + + +def _indexer_from_factorized(labels, shape, compress=True): + if _int64_overflow_possible(shape): + indexer = np.lexsort(np.array(labels[::-1])) + return indexer + + group_index = get_group_index(labels, shape) + + if compress: + comp_ids, obs_ids = _compress_group_index(group_index) + max_group = len(obs_ids) + else: + comp_ids = group_index + max_group = np.prod(shape) + + indexer, _ = lib.groupsort_indexer(comp_ids.astype(np.int64), max_group) + + return indexer + + +def _lexsort_indexer(keys): + labels = [] + shape = [] + for key in keys: + rizer = lib.Factorizer(len(key)) + + if not key.dtype == np.object_: + key = key.astype('O') + + ids, _ = rizer.factorize(key, sort=True) + labels.append(ids) + shape.append(len(rizer.uniques)) + return _indexer_from_factorized(labels, shape) + +class _KeyMapper(object): + """ + Ease my suffering. 
Map compressed group id -> key tuple + """ + def __init__(self, comp_ids, ngroups, labels, levels): + self.levels = levels + self.labels = labels + self.comp_ids = comp_ids.astype(np.int64) + + self.k = len(labels) + self.tables = [lib.Int64HashTable(ngroups) for _ in range(self.k)] + + self._populate_tables() + + def _populate_tables(self): + for labs, table in zip(self.labels, self.tables): + table.map(self.comp_ids, labs.astype(np.int64)) + + def get_key(self, comp_id): + return tuple(level[table.get_item(comp_id)] + for table, level in zip(self.tables, self.levels)) + + +def _get_indices_dict(label_list, keys): + shape = [len(x) for x in keys] + group_index = get_group_index(label_list, shape) + + sorter, _ = lib.groupsort_indexer(com._ensure_int64(group_index), + np.prod(shape)) + + sorter_int = com._ensure_platform_int(sorter) + + sorted_labels = [lab.take(sorter_int) for lab in label_list] + group_index = group_index.take(sorter_int) + + return lib.indices_fast(sorter, group_index, keys, sorted_labels) + +#---------------------------------------------------------------------- +# sorting levels...cleverly? + +def _compress_group_index(group_index, sort=True): + """ + Group_index is offsets into cartesian product of all possible labels. This + space can be huge, so this function compresses it, by computing offsets + (comp_ids) into the list of unique labels (obs_group_ids). + """ + + uniques = [] + table = lib.Int64HashTable(min(1000000, len(group_index))) + + group_index = com._ensure_int64(group_index) + + # note, group labels come out ascending (ie, 1,2,3 etc) + comp_ids = table.get_labels_groupby(group_index, uniques) + + # these are the unique ones we observed, in the order we observed them + obs_group_ids = np.array(uniques, dtype=np.int64) + + if sort and len(obs_group_ids) > 0: + obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids) + + return comp_ids, obs_group_ids + +def _reorder_by_uniques(uniques, labels): + # sorter is index where elements ought to go + sorter = uniques.argsort() + + # reverse_indexer is where elements came from + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + mask = labels < 0 + + # move labels to right locations (ie, unsort ascending labels) + labels = com.ndtake(reverse_indexer, labels) + np.putmask(labels, mask, -1) + + # sort observed ids + uniques = com.ndtake(uniques, sorter) + + return uniques, labels + +import __builtin__ + +_func_table = { + __builtin__.sum: np.sum +} + +_cython_table = { + __builtin__.sum: 'sum', + np.sum: 'sum', + np.mean: 'mean', + np.prod: 'prod', + np.std: 'std', + np.var: 'var' +} + +def _intercept_function(func): + return _func_table.get(func, func) + +def _intercept_cython(func): + return _cython_table.get(func) + +def _groupby_indices(values): + return lib.groupby_indices(com._ensure_object(values)) + +def numpy_groupby(data, labels, axis=0): + s = np.argsort(labels) + keys, inv = np.unique(labels, return_inverse=True) + i = inv.take(s) + groups_at = np.where(i != np.concatenate(([-1], i[:-1])))[0] + ordered_data = data.take(s, axis=axis) + group_sums = np.add.reduceat(ordered_data, groups_at, axis=axis) + + return group_sums + +#----------------------------------------------------------------------- +# Helper functions + + +from pandas.util import py3compat +import sys + +def install_ipython_completers(): # pragma: no cover + """Register the DataFrame type with IPython's tab completion machinery, so + that it knows about accessing 
column names as attributes.""" + from IPython.utils.generics import complete_object + + @complete_object.when_type(DataFrameGroupBy) + def complete_dataframe(obj, prev_completions): + return prev_completions + [c for c in obj.obj.columns \ + if isinstance(c, basestring) and py3compat.isidentifier(c)] + + +# Importing IPython brings in about 200 modules, so we want to avoid it unless +# we're in IPython (when those modules are loaded anyway). +if "IPython" in sys.modules: # pragma: no cover + try: + install_ipython_completers() + except Exception: + pass + + diff --git a/pandas/core/index.py b/pandas/core/index.py new file mode 100644 index 00000000..38437828 --- /dev/null +++ b/pandas/core/index.py @@ -0,0 +1,2497 @@ +# pylint: disable=E1101,E1103,W0232 + +from datetime import time + +from itertools import izip +import weakref + +import numpy as np + +from pandas.core.common import ndtake +from pandas.util.decorators import cache_readonly +from pandas.util import py3compat +import pandas.core.common as com +import pandas.lib as lib +import pandas._algos as _algos + + +__all__ = ['Index'] + + +def _indexOp(opname): + """ + Wrapper function for index comparison operations, to avoid + code duplication. + """ + def wrapper(self, other): + func = getattr(self.view(np.ndarray), opname) + result = func(other) + try: + return result.view(np.ndarray) + except: # pragma: no cover + return result + return wrapper + + +class InvalidIndexError(Exception): + pass + +_o_dtype = np.dtype(object) + +class Index(np.ndarray): + """ + Immutable ndarray implementing an ordered, sliceable set. The basic object + storing axis labels for all pandas objects + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: object) + copy : bool + Make a copy of input ndarray + + Note + ---- + An Index instance can **only** contain hashable objects + """ + # To hand over control to subclasses + _join_precedence = 1 + + # Cython methods + _groupby = _algos.groupby_object + _arrmap = _algos.arrmap_object + _left_indexer_unique = _algos.left_join_indexer_unique_object + _left_indexer = _algos.left_join_indexer_object + _inner_indexer = _algos.inner_join_indexer_object + _outer_indexer = _algos.outer_join_indexer_object + + _box_scalars = False + + name = None + asi8 = None + + _engine_type = lib.ObjectEngine + + def __new__(cls, data, dtype=None, copy=False, name=None): + if isinstance(data, np.ndarray): + if issubclass(data.dtype.type, np.datetime64): + from pandas.tseries.index import DatetimeIndex + result = DatetimeIndex(data, copy=copy, name=name) + if dtype is not None and _o_dtype == dtype: + return Index(result.to_pydatetime(), dtype=_o_dtype) + else: + return result + + if dtype is not None: + try: + data = np.array(data, dtype=dtype, copy=copy) + except TypeError: + pass + + if issubclass(data.dtype.type, np.integer): + return Int64Index(data, copy=copy, name=name) + + subarr = com._ensure_object(data) + elif np.isscalar(data): + raise ValueError('Index(...) 
must be called with a collection ' + 'of some kind, %s was passed' % repr(data)) + else: + # other iterable of some kind + subarr = com._asarray_tuplesafe(data, dtype=object) + + if dtype is None: + if (lib.is_datetime_array(subarr) + or lib.is_datetime64_array(subarr) + or lib.is_timestamp_array(subarr)): + from pandas.tseries.index import DatetimeIndex + return DatetimeIndex(subarr, copy=copy, name=name) + + if lib.is_integer_array(subarr): + return Int64Index(subarr.astype('i8'), name=name) + + subarr = subarr.view(cls) + subarr.name = name + return subarr + + def __array_finalize__(self, obj): + self.name = getattr(obj, 'name', None) + + def _shallow_copy(self): + return self.view(type(self)) + + def __repr__(self): + try: + result = np.ndarray.__repr__(self) + except UnicodeEncodeError: + result = 'Index([%s])' % (', '.join([repr(x) for x in self])) + + return result + + def astype(self, dtype): + return Index(self.values.astype(dtype), name=self.name, + dtype=dtype) + + def to_datetime(self, dayfirst=False): + """ + For an Index containing strings or datetime.datetime objects, attempt + conversion to DatetimeIndex + """ + from pandas.tseries.index import DatetimeIndex + if self.inferred_type == 'string': + from dateutil.parser import parse + parser = lambda x: parse(x, dayfirst=dayfirst) + parsed = lib.try_parse_dates(self.values, parser=parser) + return DatetimeIndex(parsed) + elif isinstance(self, DatetimeIndex): + return self.copy() + else: + return DatetimeIndex(self.values) + + def _assert_can_do_setop(self, other): + return True + + @property + def dtype(self): + return self.values.dtype + + @property + def nlevels(self): + return 1 + + # for compat with multindex code + + def _get_names(self): + return [self.name] + + def _set_names(self, values): + assert(len(values) == 1) + self.name = values[0] + + names = property(fset=_set_names, fget=_get_names) + + @property + def _constructor(self): + return Index + + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False + + def summary(self, name=None): + if len(self) > 0: + index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + + def __str__(self): + try: + return np.array_repr(self.values) + except UnicodeError: + converted = u','.join(unicode(x) for x in self.values) + return u'%s([%s], dtype=''%s'')' % (type(self).__name__, converted, + str(self.values.dtype)) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values + + @property + def values(self): + return np.asarray(self) + + @property + def is_monotonic(self): + return self._engine.is_monotonic + + @cache_readonly + def is_unique(self): + return self._engine.is_unique + + def is_numeric(self): + return self.inferred_type in ['integer', 'floating'] + + def get_duplicates(self): + from collections import defaultdict + counter = defaultdict(lambda: 0) + for k in self.values: + counter[k] += 1 + return sorted(k for k, v in counter.iteritems() if v > 1) + + _get_duplicates = get_duplicates + + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self.values, len(self)) + + def _get_level_number(self, level): + if not isinstance(level, int): + assert(level == self.name) + level = 0 + return level + + @cache_readonly + def 
inferred_type(self): + return lib.infer_dtype(self) + + def is_type_compatible(self, typ): + return typ == self.inferred_type + + @cache_readonly + def is_all_dates(self): + return self.inferred_type == 'datetime' + + def __iter__(self): + return iter(self.values) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = self.name, + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if len(state) == 2: + nd_state, own_state = state + np.ndarray.__setstate__(self, nd_state) + self.name = own_state[0] + else: # pragma: no cover + np.ndarray.__setstate__(self, state) + + def __deepcopy__(self, memo={}): + """ + Index is not mutable, so disabling deepcopy + """ + return self + + def __contains__(self, key): + hash(key) + # work around some kind of odd cython bug + try: + return key in self._engine + except TypeError: + return False + + def __hash__(self): + return hash(self.view(np.ndarray)) + + def __setitem__(self, key, value): + """Disable the setting of values.""" + raise Exception(str(self.__class__) + ' object is immutable') + + def __getitem__(self, key): + """Override numpy.ndarray's __getitem__ method to work as desired""" + arr_idx = self.view(np.ndarray) + if np.isscalar(key): + return arr_idx[key] + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + + result = arr_idx[key] + if result.ndim > 1: + return result + + return Index(result, name=self.name) + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = _ensure_compat_concat(to_concat) + to_concat = [x.values if isinstance(x, Index) else x + for x in to_concat] + + return Index(np.concatenate(to_concat), name=name) + + def take(self, indexer, axis=0): + """ + Analogous to ndarray.take + """ + indexer = com._ensure_platform_int(indexer) + taken = self.view(np.ndarray).take(indexer) + return self._constructor(taken, name=self.name) + + def format(self, name=False): + """ + Render a string representation of the Index + """ + from pandas.core.format import format_array + + header = [] + if name: + header.append(str(self.name) if self.name is not None else '') + + if self.is_all_dates: + zero_time = time(0, 0) + result = [] + for dt in self: + if dt.time() != zero_time or dt.tzinfo is not None: + return header + ['%s' % x for x in self] + result.append('%d-%.2d-%.2d' % (dt.year, dt.month, dt.day)) + return header + result + + values = self.values + + if values.dtype == np.object_: + values = lib.maybe_convert_objects(values, safe=1) + + if values.dtype == np.object_: + result = com._stringify_seq(values) + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. 
+ """ + if self is other: + return True + + if not isinstance(other, Index): + return False + + if type(other) != Index: + return other.equals(self) + + return np.array_equal(self, other) + + def asof(self, label): + """ + For a sorted index, return the most recent label up to and including + the passed label. Return NaN if not found + """ + if label not in self: + loc = self.searchsorted(label, side='left') + if loc > 0: + return self[loc - 1] + else: + return np.nan + + return label + + def asof_locs(self, where, mask): + """ + where : array of timestamps + mask : array of booleans where data is not NA + + """ + locs = self.values[mask].searchsorted(where.values, side='right') + + locs = np.where(locs > 0, locs - 1, 0) + result = np.arange(len(self))[mask].take(locs) + + first = mask.argmax() + result[(locs == 0) & (where < self.values[first])] = -1 + + return result + + def order(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] + + sorted_index = self.take(_as) + + if return_indexer: + return sorted_index, _as + else: + return sorted_index + + def sort(self, *args, **kwargs): + raise Exception('Cannot sort an Index object') + + def shift(self, periods=1, freq=None): + """ + Shift Index containing datetime objects by input number of periods and + DateOffset + + Returns + ------- + shifted : Index + """ + if periods == 0: + # OK because immutable + return self + + offset = periods * freq + return Index([idx + offset for idx in self]) + + def argsort(self, *args, **kwargs): + """ + See docstring for ndarray.argsort + """ + return self.view(np.ndarray).argsort(*args, **kwargs) + + def __add__(self, other): + if isinstance(other, Index): + return self.union(other) + else: + return Index(self.view(np.ndarray) + other) + + __eq__ = _indexOp('__eq__') + __ne__ = _indexOp('__ne__') + __lt__ = _indexOp('__lt__') + __gt__ = _indexOp('__gt__') + __le__ = _indexOp('__le__') + __ge__ = _indexOp('__ge__') + + def __sub__(self, other): + return self.diff(other) + + def __and__(self, other): + return self.intersection(other) + + def __or__(self, other): + return self.union(other) + + def union(self, other): + """ + Form the union of two Index objects and sorts if possible + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + union : Index + """ + if not hasattr(other, '__iter__'): + raise Exception('Input must be iterable!') + + if len(other) == 0 or self.equals(other): + return self + + if len(self) == 0: + return _ensure_index(other) + + self._assert_can_do_setop(other) + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.union(other) + + if self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(self, other.values)[0] + except TypeError: + # incomparable objects + result = list(self.values) + + # worth making this faster? 
a very unusual case + value_set = set(self.values) + result.extend([x for x in other.values if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer = (indexer == -1).nonzero()[0] + + if len(indexer) > 0: + other_diff = ndtake(other.values, indexer) + result = np.concatenate((self.values, other_diff)) + try: + result.sort() + except Exception: + pass + else: + # contained in + try: + result = np.sort(self.values) + except TypeError: # pragma: no cover + result = self.values + + # for subclasses + return self._wrap_union_result(other, result) + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + return type(self)(data=result, name=name) + + def intersection(self, other): + """ + Form the intersection of two Index objects. Sortedness of the result is + not guaranteed + + Parameters + ---------- + other : Index or array-like + + Returns + ------- + intersection : Index + """ + if not hasattr(other, '__iter__'): + raise Exception('Input must be iterable!') + + self._assert_can_do_setop(other) + + other = _ensure_index(other) + + if self.equals(other): + return self + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) + + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(self, other.values)[0] + return self._wrap_union_result(other, result) + except TypeError: + pass + + indexer = self.get_indexer(other.values) + indexer = indexer.take((indexer != -1).nonzero()[0]) + return self.take(indexer) + + def diff(self, other): + """ + Compute sorted set difference of two Index objects + + Notes + ----- + One can do either of these and achieve the same result + + >>> index - index2 + >>> index.diff(index2) + + Returns + ------- + diff : Index + """ + + if not hasattr(other, '__iter__'): + raise Exception('Input must be iterable!') + + if self.equals(other): + return Index([]) + + if not isinstance(other, Index): + other = np.asarray(other) + + theDiff = sorted(set(self) - set(other)) + return Index(theDiff) + + def unique(self): + """ + Return array of unique values in the Index. Significantly faster than + numpy.unique + + Returns + ------- + uniques : ndarray + """ + from pandas.core.nanops import unique1d + return unique1d(self.values) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + return self._engine.get_loc(key) + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + try: + return self._engine.get_value(series, key) + except KeyError, e1: + if len(self) > 0 and self.inferred_type == 'integer': + raise + + try: + return lib.get_value_box(series, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if com.is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if np.isscalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + self._engine.set_value(arr, key, value) + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. 
The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : Index + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[-mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = self._get_method(method) + target = _ensure_index(target) + + pself, ptarget = self._possibly_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit) + + if self.dtype != target.dtype: + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit) + + if not self.is_unique: + raise Exception('Reindexing only valid with uniquely valued Index ' + 'objects') + + if method == 'pad': + assert(self.is_monotonic) + indexer = self._engine.get_pad_indexer(target.values, limit) + elif method == 'backfill': + assert(self.is_monotonic) + indexer = self._engine.get_backfill_indexer(target.values, limit) + elif method is None: + indexer = self._engine.get_indexer(target.values) + else: + raise ValueError('unrecognized method: %s' % method) + + return com._ensure_platform_int(indexer) + + def _possibly_promote(self, other): + # A hack, but it works + from pandas.tseries.index import DatetimeIndex + if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + return DatetimeIndex(self), other + return self, other + + def groupby(self, to_groupby): + return self._groupby(self.values, to_groupby) + + def map(self, mapper): + return self._arrmap(self.values, mapper) + + def isin(self, values): + """ + Compute boolean array of whether each index value is found in the + passed set of values + + Parameters + ---------- + values : set or sequence of values + + Returns + ------- + is_contained : ndarray (boolean dtype) + """ + value_set = set(values) + return lib.ismember(self._array_values(), value_set) + + def _array_values(self): + return self + + def _get_method(self, method): + if method: + method = method.lower() + + aliases = { + 'ffill': 'pad', + 'bfill': 'backfill' + } + return aliases.get(method, method) + + def reindex(self, target, method=None, level=None, limit=None): + """ + For Index, simply returns the new index and the results of + get_indexer. Provided here to enable an interface that is amenable for + subclasses of Index whose internals are different (like MultiIndex) + + Returns + ------- + (new_index, indexer, mask) : tuple + """ + target = _ensure_index(target) + if level is not None: + if method is not None: + raise ValueError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + if self.equals(target): + indexer = None + else: + indexer = self.get_indexer(target, method=method, + limit=limit) + return target, indexer + + def join(self, other, how='left', level=None, return_indexers=False): + """ + Internal API method. Compute join_index and indexers to conform data + structures to the new index. 
+ + Parameters + ---------- + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : + return_indexers : boolean, default False + + Returns + ------- + join_index, (left_indexer, right_indexer) + """ + if (level is not None and (isinstance(self, MultiIndex) or + isinstance(other, MultiIndex))): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) + + other = _ensure_index(other) + + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index + + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if self.dtype != other.dtype: + this = self.astype('O') + other = other.astype('O') + return this.join(other, how=how, + return_indexers=return_indexers) + + _validate_join_method(how) + + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: + try: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + except TypeError: + pass + + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + elif how == 'outer': + join_index = self.union(other) + + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index + + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.tools.merge import _get_join_indexers + + left_idx, right_idx = _get_join_indexers([self.values], [other.values], + how=how, sort=True) + + left_idx = com._ensure_platform_int(left_idx) + right_idx = com._ensure_platform_int(right_idx) + + join_index = self.values.take(left_idx) + mask = left_idx == -1 + np.putmask(join_index, mask, other.values.take(right_idx)) + + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index + + def _join_level(self, other, level, how='left', return_indexers=False): + """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. 
The order of the data indexed by + the MultiIndex will not be changed (currently) + """ + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise Exception('Join on level between two MultiIndex objects ' + 'is ambiguous') + + left, right = self, other + + flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) + + level = left._get_level_number(level) + old_level = left.levels[level] + + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) + + if left_lev_indexer is not None: + left_lev_indexer = com._ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) + + new_lev_labels = ndtake(rev_indexer, left.labels[level]) + omit_mask = new_lev_labels != -1 + + new_labels = list(left.labels) + new_labels[level] = new_lev_labels + + if not omit_mask.all(): + new_labels = [lab[omit_mask] for lab in new_labels] + + new_levels = list(left.levels) + new_levels[level] = new_level + + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names) + left_indexer = np.arange(len(left))[new_lev_labels != -1] + else: + join_index = left + left_indexer = None + + if right_lev_indexer is not None: + right_indexer = ndtake(right_lev_indexer, + join_index.labels[level]) + else: + right_indexer = join_index.labels[level] + + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer + + if return_indexers: + return join_index, left_indexer, right_indexer + else: + return join_index + + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index + + sv = self.values + ov = other.values + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv,ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + else: + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(other, self) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + + if return_indexers: + return join_index, lidx, ridx + else: + return join_index + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + return Index(joined, name=name) + + def slice_locs(self, start=None, end=None): + """ + For an ordered Index, compute the slice locations for input labels + + Parameters + ---------- + start : label, default None + If None, defaults to the beginning + end : label + If None, defaults to the end + + Returns + ------- + (begin, end) : (int, int) + + Notes + ----- + This function assumes that the data is sorted, so use at your own peril + """ + if start is None: + beg_slice = 0 + else: + try: + beg_slice = 
self.get_loc(start) + except KeyError: + if self.is_monotonic: + beg_slice = self.searchsorted(start, side='left') + else: + raise + + if end is None: + end_slice = len(self) + else: + try: + end_slice = self.get_loc(end) + 1 + except KeyError: + if self.is_monotonic: + end_slice = self.searchsorted(end, side='right') + else: + raise + + return beg_slice, end_slice + + def delete(self, loc): + """ + Make new Index with passed location deleted + + Returns + ------- + new_index : Index + """ + arr = np.delete(np.asarray(self), loc) + return Index(arr) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + """ + index = np.asarray(self) + # because numpy is fussy with tuples + item_idx = Index([item], dtype=index.dtype) + new_index = np.concatenate((index[:loc], item_idx, index[loc:])) + return Index(new_index, name=self.name) + + def drop(self, labels): + """ + Make new Index with passed list of labels deleted + + Parameters + ---------- + labels : array-like + + Returns + ------- + dropped : Index + """ + labels = com._index_labels_to_array(labels) + indexer = self.get_indexer(labels) + mask = indexer == -1 + if mask.any(): + raise ValueError('labels %s not contained in axis' % labels[mask]) + return self.delete(indexer) + + def copy(self, order='C'): + """ + Overridden ndarray.copy to copy over attributes + + Returns + ------- + cp : Index + Returns view on same base ndarray + """ + cp = self.view(np.ndarray).view(type(self)) + cp.__dict__.update(self.__dict__) + return cp + + +class Int64Index(Index): + + _groupby = _algos.groupby_int64 + _arrmap = _algos.arrmap_int64 + _left_indexer_unique = _algos.left_join_indexer_unique_int64 + _left_indexer = _algos.left_join_indexer_int64 + _inner_indexer = _algos.inner_join_indexer_int64 + _outer_indexer = _algos.outer_join_indexer_int64 + + _engine_type = lib.Int64Engine + + def __new__(cls, data, dtype=None, copy=False, name=None): + if not isinstance(data, np.ndarray): + if np.isscalar(data): + raise ValueError('Index(...) must be called with a collection ' + 'of some kind, %s was passed' % repr(data)) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + data = np.asarray(data) + + if issubclass(data.dtype.type, basestring): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to int') + elif issubclass(data.dtype.type, np.integer): + subarr = np.array(data, dtype=np.int64, copy=copy) + else: + subarr = np.array(data, dtype=np.int64, copy=copy) + if len(data) > 0: + if (subarr != data).any(): + raise TypeError('Unsafe NumPy casting, you must ' + 'explicitly cast') + + subarr = subarr.view(cls) + subarr.name = name + return subarr + + @property + def inferred_type(self): + return 'integer' + + @property + def _constructor(self): + return Int64Index + + @property + def dtype(self): + return np.dtype('int64') + + @property + def is_all_dates(self): + """ + Checks that all the labels are datetime objects + """ + return False + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. 
+ """ + if self is other: + return True + + # if not isinstance(other, Int64Index): + # return False + + return np.array_equal(self, other) + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + return Int64Index(joined, name=name) + + + + +class MultiIndex(Index): + """ + Implements multi-level, a.k.a. hierarchical, index object for pandas + objects + + Parameters + ---------- + levels : list or tuple of arrays + The unique labels for each level + labels : list or tuple of arrays + Integers for each level designating which label at each location + """ + # shadow property + names = None + + def __new__(cls, levels=None, labels=None, sortorder=None, names=None): + assert(len(levels) == len(labels)) + if len(levels) == 0: + raise Exception('Must pass non-zero number of levels/labels') + + if len(levels) == 1: + if names: + name = names[0] + else: + name = None + + return Index(levels[0], name=name).take(labels[0]) + + levels = [_ensure_index(lev) for lev in levels] + labels = [np.asarray(labs, dtype=np.int_) for labs in labels] + + # v3, 0.8.0 + subarr = np.empty(0, dtype=object).view(cls) + subarr.levels = levels + subarr.labels = labels + + if names is None: + subarr.names = [None] * subarr.nlevels + else: + assert(len(names) == subarr.nlevels) + subarr.names = list(names) + + # set the name + for i, name in enumerate(subarr.names): + subarr.levels[i].name = name + + if sortorder is not None: + subarr.sortorder = int(sortorder) + else: + subarr.sortorder = sortorder + + return subarr + + def copy(self, order='C'): + """ + Overridden ndarray.copy to copy over attributes + + Returns + ------- + cp : Index + Returns view on same base ndarray + """ + cp = self.view(np.ndarray).view(type(self)) + cp.levels = list(self.levels) + cp.labels = list(self.labels) + cp.names = list(self.names) + cp.sortorder = self.sortorder + return cp + + def _array_values(self): + # hack for various methods + return self.values + + @property + def dtype(self): + return np.dtype('O') + + def __repr__(self): + output = 'MultiIndex\n%s' + + options = np.get_printoptions() + np.set_printoptions(threshold=50) + + if len(self) > 100: + values = np.concatenate([self[:50].values, + self[-50:].values]) + else: + values = self.values + summary = np.array2string(values, max_line_width=70) + + np.set_printoptions(threshold=options['threshold']) + + return output % summary + + def __len__(self): + return len(self.labels[0]) + + @property + def _constructor(self): + return MultiIndex.from_tuples + + @cache_readonly + def inferred_type(self): + return 'mixed' + + @staticmethod + def _from_elements(values, labels=None, levels=None, names=None, + sortorder=None): + index = values.view(MultiIndex) + index.levels = levels + index.labels = labels + index.names = names + index.sortorder = sortorder + return index + + def _get_level_number(self, level): + try: + count = self.names.count(level) + if count > 1: + raise Exception('The name %s occurs multiple times, use a ' + 'level number' % level) + level = self.names.index(level) + except ValueError: + if not isinstance(level, int): + raise Exception('Level %s not found' % str(level)) + elif level < 0: + level += self.nlevels + elif level >= self.nlevels: + raise ValueError('Index has only %d levels, not %d' + % (self.nlevels, level)) + return level + + _tuples = None + + @property + def values(self): + if self._is_v2: + return self.view(np.ndarray) + else: + if self._tuples is not None: + return self._tuples + + values = 
[ndtake(lev.values, lab) + for lev, lab in zip(self.levels, self.labels)] + + # Need to box timestamps, etc. + values = _clean_arrays(values) + self._tuples = lib.fast_zip(values) + return self._tuples + + # fml + @property + def _is_v1(self): + contents = self.view(np.ndarray) + return len(contents) > 0 and not isinstance(contents[0], tuple) + + @property + def _is_v2(self): + contents = self.view(np.ndarray) + return len(contents) > 0 and isinstance(contents[0], tuple) + + @property + def _has_complex_internals(self): + # to disable groupby tricks + return True + + @property + def has_duplicates(self): + """ + Return True if there are no unique groups + """ + # has duplicates + shape = [len(lev) for lev in self.levels] + group_index = np.zeros(len(self), dtype='i8') + for i in xrange(len(shape)): + stride = np.prod([x for x in shape[i+1:]], dtype='i8') + group_index += self.labels[i] * stride + + if len(np.unique(group_index)) < len(group_index): + return True + + return False + + def get_value(self, series, key): + # somewhat broken encapsulation + from pandas.core.indexing import _maybe_droplevels + from pandas.core.series import Series + + # Label-based + try: + return self._engine.get_value(series, key) + except KeyError, e1: + try: + # TODO: what if a level contains tuples?? + loc = self.get_loc(key) + new_values = series.values[loc] + new_index = self[loc] + new_index = _maybe_droplevels(new_index, key) + return Series(new_values, index=new_index, name=series.name) + except KeyError: + pass + + try: + return lib.get_value_at(series, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if com.is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + raise InvalidIndexError(key) + + def get_level_values(self, level): + """ + Return vector of label values for requested level, equal to the length + of the index + + Parameters + ---------- + level : int + + Returns + ------- + values : ndarray + """ + num = self._get_level_number(level) + unique_vals = self.levels[num].values + labels = self.labels[num] + return unique_vals.take(labels) + + def format(self, space=2, sparsify=True, adjoin=True, names=False): + if len(self) == 0: + return [] + + stringified_levels = [lev.format() for lev in self.levels] + + result_levels = [] + for lab, lev, name in zip(self.labels, stringified_levels, self.names): + level = [] + + if names: + level.append(str(name) if name is not None else '') + + level.extend(ndtake(np.array(lev, dtype=object), lab)) + result_levels.append(level) + + if sparsify: + # little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, + start=int(names)) + + if adjoin: + return com.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + @property + def is_all_dates(self): + return False + + def is_lexsorted(self): + """ + Return True if the labels are lexicographically sorted + """ + return self.lexsort_depth == self.nlevels + + @cache_readonly + def lexsort_depth(self): + if self.sortorder is not None: + if self.sortorder == 0: + return self.nlevels + else: + return 0 + + int64_labels = [com._ensure_int64(lab) for lab in self.labels] + for k in range(self.nlevels, 0, -1): + if lib.is_lexsorted(int64_labels[:k]): + return k + + return 0 + + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence + sortorder : int or None 
+ Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + """ + from pandas.core.categorical import Categorical + + if len(arrays) == 1: + name = None if names is None else names[0] + return Index(arrays[0], name=name) + + cats = [Categorical.from_array(arr) for arr in arrays] + levels = [c.levels for c in cats] + labels = [c.labels for c in cats] + if names is None: + names = [c.name for c in cats] + + return MultiIndex(levels=levels, labels=labels, + sortorder=sortorder, names=names) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : array-like + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + """ + if len(tuples) == 0: + raise Exception('Cannot infer number of levels from empty list') + + if isinstance(tuples, np.ndarray): + if isinstance(tuples, Index): + tuples = tuples.values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = zip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, + names=names) + + @property + def nlevels(self): + return len(self.levels) + + @property + def levshape(self): + return tuple(len(x) for x in self.levels) + + def __contains__(self, key): + hash(key) + # work around some kind of odd cython bug + try: + self.get_loc(key) + return True + except KeyError: + return False + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = (self.levels, self.labels, self.sortorder, self.names) + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + np.ndarray.__setstate__(self, nd_state) + levels, labels, sortorder, names = own_state + + self.levels = [Index(x) for x in levels] + self.labels = labels + self.names = names + self.sortorder = sortorder + + def __getitem__(self, key): + if np.isscalar(key): + return tuple(lev[lab[key]] + for lev, lab in zip(self.levels, self.labels)) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + sortorder = self.sortorder + else: + # cannot be sure whether the result will be sorted + sortorder = None + + result = np.empty(0, dtype=object).view(type(self)) + new_labels = [lab[key] for lab in self.labels] + + # an optimization + result.levels = list(self.levels) + result.labels = new_labels + result.sortorder = sortorder + result.names = self.names + + return result + + def take(self, indexer, axis=None): + """ + Analogous to ndarray.take + """ + indexer = com._ensure_platform_int(indexer) + new_labels = [lab.take(indexer) for lab in self.labels] + return MultiIndex(levels=self.levels, labels=new_labels, + names=self.names) + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + if isinstance(other, (list, tuple)): + to_concat = (self.values,) + tuple(k.values for k in other) + else: + to_concat = self.values, other.values + new_tuples = np.concatenate(to_concat) + return MultiIndex.from_tuples(new_tuples, names=self.names) + + def argsort(self, *args, **kwargs): + 
return self.values.argsort() + + def drop(self, labels, level=None): + """ + Make new MultiIndex with passed list of labels deleted + + Parameters + ---------- + labels : array-like + Must be a list of tuples + level : int or name, default None + + Returns + ------- + dropped : MultiIndex + """ + if level is not None: + return self._drop_from_level(labels, level) + + try: + if not isinstance(labels, np.ndarray): + labels = com._index_labels_to_array(labels) + indexer = self.get_indexer(labels) + mask = indexer == -1 + if mask.any(): + raise ValueError('labels %s not contained in axis' + % labels[mask]) + return self.delete(indexer) + except Exception: + pass + + inds = [] + for label in labels: + loc = self.get_loc(label) + if isinstance(loc, int): + inds.append(loc) + else: + inds.extend(range(loc.start, loc.stop)) + + return self.delete(inds) + + def _drop_from_level(self, labels, level): + labels = com._index_labels_to_array(labels) + i = self._get_level_number(level) + index = self.levels[i] + values = index.get_indexer(labels) + + mask = -lib.ismember(self.labels[i], set(values)) + + return self[mask] + + def droplevel(self, level=0): + """ + Return Index with requested level removed. If MultiIndex has only 2 + levels, the result will be of Index type not MultiIndex. + + Parameters + ---------- + level : int/level name or list thereof + + Notes + ----- + Does not check if result index is unique or not + + Returns + ------- + index : Index or MultiIndex + """ + levels = level + if not isinstance(levels, (tuple, list)): + levels = [level] + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + levnums = sorted(self._get_level_number(lev) for lev in levels)[::-1] + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + result = new_levels[0].take(new_labels[0]) + result.name = new_names[0] + return result + else: + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + def swaplevel(self, i, j): + """ + Swap level i with level j. Do not change the ordering of anything + + Returns + ------- + swapped : MultiIndex + """ + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + i = self._get_level_number(i) + j = self._get_level_number(j) + + new_levels[i], new_levels[j] = new_levels[j], new_levels[i] + new_labels[i], new_labels[j] = new_labels[j], new_labels[i] + new_names[i], new_names[j] = new_names[j], new_names[i] + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + def reorder_levels(self, order): + """ + Rearrange levels using input order. May not drop or duplicate levels + + Parameters + ---------- + """ + order = [self._get_level_number(i) for i in order] + assert(len(order) == self.nlevels) + new_levels = [self.levels[i] for i in order] + new_labels = [self.labels[i] for i in order] + new_names = [self.names[i] for i in order] + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + def __getslice__(self, i, j): + return self.__getitem__(slice(i, j)) + + def sortlevel(self, level=0, ascending=True): + """ + Sort MultiIndex at the requested level. The result will respect the + original ordering of the associated factor at that level. 
+ + Parameters + ---------- + level : int or str, default 0 + If a string is given, must be a name of the level + ascending : boolean, default True + False to sort in descending order + + Returns + ------- + sorted_index : MultiIndex + """ + from pandas.core.groupby import _indexer_from_factorized + + labels = list(self.labels) + + level = self._get_level_number(level) + primary = labels.pop(level) + + shape = list(self.levshape) + primshp = shape.pop(level) + + indexer = _indexer_from_factorized((primary,) + tuple(labels), + (primshp,) + tuple(shape), + compress=False) + if not ascending: + indexer = indexer[::-1] + + indexer = com._ensure_platform_int(indexer) + new_labels = [lab.take(indexer) for lab in self.labels] + + new_index = MultiIndex(labels=new_labels, levels=self.levels, + names=self.names, sortorder=level) + + return new_index, indexer + + def get_indexer(self, target, method=None, limit=None): + """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. The mask determines whether labels are + found or not in the current index + + Parameters + ---------- + target : MultiIndex or Index (of tuples) + method : {'pad', 'ffill', 'backfill', 'bfill'} + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Notes + ----- + This is a low-level method and probably should be used at your own risk + + Examples + -------- + >>> indexer, mask = index.get_indexer(new_index) + >>> new_values = cur_values.take(indexer) + >>> new_values[-mask] = np.nan + + Returns + ------- + (indexer, mask) : (ndarray, ndarray) + """ + method = self._get_method(method) + + target = _ensure_index(target) + + target_index = target + if isinstance(target, MultiIndex): + target_index = target._tuple_index + + if target_index.dtype != object: + return np.ones(len(target_index)) * -1 + + self_index = self._tuple_index + + if method == 'pad': + assert(self.is_unique and self.is_monotonic) + indexer = self_index._engine.get_pad_indexer(target_index, + limit=limit) + elif method == 'backfill': + assert(self.is_unique and self.is_monotonic) + indexer = self_index._engine.get_backfill_indexer(target_index, + limit=limit) + else: + indexer = self_index._engine.get_indexer(target_index) + + return com._ensure_platform_int(indexer) + + def reindex(self, target, method=None, level=None, limit=None): + """ + Performs any necessary conversion on the input index and calls + get_indexer. This method is here so MultiIndex and an Index of + like-labeled tuples can play nice together + + Returns + ------- + (new_index, indexer, mask) : (MultiIndex, ndarray, ndarray) + """ + if level is not None: + if method is not None: + raise ValueError('Fill method not supported if level passed') + target, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) + else: + if self.equals(target): + indexer = None + else: + indexer = self.get_indexer(target, method=method, + limit=limit) + + if not isinstance(target, MultiIndex): + if indexer is None: + target = self + elif (indexer >= 0).all(): + target = self.take(indexer) + else: + # hopefully? 
+ target = MultiIndex.from_tuples(target) + + return target, indexer + + @cache_readonly + def _tuple_index(self): + """ + Convert MultiIndex to an Index of tuples + + Returns + ------- + index : Index + """ + return Index(self.values) + + def slice_locs(self, start=None, end=None, strict=False): + """ + For an ordered MultiIndex, compute the slice locations for input + labels. They can tuples representing partial levels, e.g. for a + MultiIndex with 3 levels, you can pass a single value (corresponding to + the first level), or a 1-, 2-, or 3-tuple. + + Parameters + ---------- + start : label or tuple, default None + If None, defaults to the beginning + end : label or tuple + If None, defaults to the end + strict : boolean, + + Returns + ------- + (begin, end) : (int, int) + + Notes + ----- + This function assumes that the data is sorted by the first level + """ + if start is None: + start_slice = 0 + else: + if not isinstance(start, tuple): + start = start, + start_slice = self._partial_tup_index(start, side='left') + + if end is None: + end_slice = len(self) + else: + if not isinstance(end, tuple): + end = end, + end_slice = self._partial_tup_index(end, side='right') + + return start_slice, end_slice + + def _partial_tup_index(self, tup, side='left'): + if len(tup) > self.lexsort_depth: + raise KeyError('MultiIndex lexsort depth %d, key was length %d' % + (self.lexsort_depth, len(tup))) + + n = len(tup) + start, end = 0, len(self) + zipped = izip(tup, self.levels, self.labels) + for k, (lab, lev, labs) in enumerate(zipped): + section = labs[start:end] + + if lab not in lev: + if not lev.is_type_compatible(lib.infer_dtype([lab])): + raise Exception('Level type mismatch: %s' % lab) + + # short circuit + loc = lev.searchsorted(lab, side=side) + if side == 'right' and loc >= 0: + loc -= 1 + return start + section.searchsorted(loc, side=side) + + idx = lev.get_loc(lab) + if k < n - 1: + end = start + section.searchsorted(idx, side='right') + start = start + section.searchsorted(idx, side='left') + else: + return start + section.searchsorted(idx, side=side) + + def get_loc(self, key): + """ + Get integer location slice for requested label or tuple + + Parameters + ---------- + key : label or tuple + + Returns + ------- + loc : int or slice object + """ + if isinstance(key, tuple): + if len(key) == self.nlevels: + return self._engine.get_loc(key) + else: + # partial selection + result = slice(*self.slice_locs(key, key)) + if result.start == result.stop: + raise KeyError(key) + return result + else: + return self._get_level_indexer(key, level=0) + + def get_loc_level(self, key, level=0): + """ + Get integer location slice for requested label or tuple + + Parameters + ---------- + key : label or tuple + + Returns + ------- + loc : int or slice object + """ + def _drop_levels(indexer, levels): + # kludgearound + new_index = self[indexer] + levels = [self._get_level_number(i) for i in levels] + for i in sorted(levels, reverse=True): + new_index = new_index.droplevel(i) + return new_index + + if isinstance(level, (tuple, list)): + assert(len(key) == len(level)) + result = None + for lev, k in zip(level, key): + loc, new_index = self.get_loc_level(k, level=lev) + if isinstance(loc, slice): + mask = np.zeros(len(self), dtype=bool) + mask[loc] = True + loc = mask + + result = loc if result is None else result & loc + return result, _drop_levels(result, level) + + level = self._get_level_number(level) + + if isinstance(key, tuple) and level == 0: + try: + if key in self.levels[0]: + indexer = 
self._get_level_indexer(key, level=level) + new_index = _drop_levels(indexer, [0]) + return indexer, new_index + except TypeError: + pass + + if not any(isinstance(k, slice) for k in key): + if len(key) == self.nlevels: + return self._engine.get_loc(key), None + else: + # partial selection + indexer = slice(*self.slice_locs(key, key)) + if indexer.start == indexer.stop: + raise KeyError(key) + ilevels = [i for i in range(len(key)) + if key[i] != slice(None, None)] + return indexer, _drop_levels(indexer, ilevels) + else: + indexer = None + for i, k in enumerate(key): + if not isinstance(k, slice): + k = self._get_level_indexer(k, level=i) + if isinstance(k, slice): + # everything + if k.start == 0 and k.stop == len(self): + k = slice(None, None) + else: + k_index = k + + if isinstance(k, slice): + if k == slice(None, None): + continue + else: + raise TypeError(key) + + if indexer is None: + indexer = k_index + else: # pragma: no cover + indexer &= k_index + if indexer is None: + indexer = slice(None, None) + ilevels = [i for i in range(len(key)) + if key[i] != slice(None, None)] + return indexer, _drop_levels(indexer, ilevels) + else: + indexer = self._get_level_indexer(key, level=level) + new_index = _drop_levels(indexer, [level]) + return indexer, new_index + + def _get_level_indexer(self, key, level=0): + level_index = self.levels[level] + loc = level_index.get_loc(key) + labels = self.labels[level] + + if level > 0 or self.lexsort_depth == 0: + return labels == loc + else: + # sorted, so can return slice object -> view + i = labels.searchsorted(loc, side='left') + j = labels.searchsorted(loc, side='right') + return slice(i, j) + + def truncate(self, before=None, after=None): + """ + Slice index between two labels / tuples, return new MultiIndex + + Parameters + ---------- + before : label or tuple, can be partial. Default None + None defaults to start + after : label or tuple, can be partial. 
Default None + None defaults to end + + Returns + ------- + truncated : MultiIndex + """ + if after and before and after < before: + raise ValueError('after < before') + + i, j = self.levels[0].slice_locs(before, after) + left, right = self.slice_locs(before, after) + + new_levels = list(self.levels) + new_levels[0] = new_levels[0][i:j] + + new_labels = [lab[left:right] for lab in self.labels] + new_labels[0] = new_labels[0] - i + + return MultiIndex(levels=new_levels, labels=new_labels) + + def equals(self, other): + """ + Determines if two MultiIndex objects have the same labeling information + (the levels themselves do not necessarily have to be the same) + + See also + -------- + equal_levels + """ + if self is other: + return True + + if not isinstance(other, MultiIndex): + return np.array_equal(self.values, _ensure_index(other)) + + if self.nlevels != other.nlevels: + return False + + if len(self) != len(other): + return False + + for i in xrange(self.nlevels): + svalues = ndtake(self.levels[i].values, self.labels[i]) + ovalues = ndtake(other.levels[i].values, other.labels[i]) + if not np.array_equal(svalues, ovalues): + return False + + return True + + def equal_levels(self, other): + """ + Return True if the levels of both MultiIndex objects are the same + + """ + if self.nlevels != other.nlevels: + return False + + for i in xrange(self.nlevels): + if not self.levels[i].equals(other.levels[i]): + return False + return True + + def union(self, other): + """ + Form the union of two MultiIndex objects, sorting if possible + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + + Returns + ------- + Index + """ + self._assert_can_do_setop(other) + + if len(other) == 0 or self.equals(other): + return self + + result_names = self.names if self.names == other.names else None + + uniq_tuples = lib.fast_unique_multiple([self.values, other.values]) + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) + + def intersection(self, other): + """ + Form the intersection of two MultiIndex objects, sorting if possible + + Parameters + ---------- + other : MultiIndex or array / Index of tuples + + Returns + ------- + Index + """ + self._assert_can_do_setop(other) + + if self.equals(other): + return self + + result_names = self.names if self.names == other.names else None + + self_tuples = self.values + other_tuples = other.values + uniq_tuples = sorted(set(self_tuples) & set(other_tuples)) + if len(uniq_tuples) == 0: + return MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names) + else: + return MultiIndex.from_arrays(zip(*uniq_tuples), sortorder=0, + names=result_names) + + def diff(self, other): + """ + Compute sorted set difference of two MultiIndex objects + + Returns + ------- + diff : MultiIndex + """ + self._assert_can_do_setop(other) + + result_names = self.names if self.names == other.names else None + + if self.equals(other): + return MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names) + + difference = sorted(set(self.values) - set(other.values)) + + if len(difference) == 0: + return MultiIndex(levels=[[]] * self.nlevels, + labels=[[]] * self.nlevels, + names=result_names) + else: + return MultiIndex.from_tuples(difference, sortorder=0, + names=result_names) + + def _assert_can_do_setop(self, other): + if not isinstance(other, MultiIndex): + raise TypeError('can only call with other hierarchical ' + 'index objects') + + assert(self.nlevels == other.nlevels) + + 
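+    # Usage sketch for the set operations above (illustrative only,
+    # hypothetical values); results are computed on the flattened tuples:
+    #
+    #   >>> left = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+    #   >>> right = MultiIndex.from_tuples([('a', 2), ('b', 2)])
+    #   >>> left.union(right)         # tuples appearing in either index
+    #   >>> left.intersection(right)  # tuples appearing in both
+    #   >>> left.diff(right)          # tuples appearing only in `left`
+    #
+    # names are kept only when both indexes agree on them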
+    def insert(self, loc, item):
+        """
+        Make new MultiIndex inserting new item at location
+
+        Parameters
+        ----------
+        loc : int
+        item : tuple
+            Must be same length as number of levels in the MultiIndex
+
+        Returns
+        -------
+        new_index : Index
+        """
+        if not isinstance(item, tuple) or len(item) != self.nlevels:
+            raise Exception("%s cannot be inserted in this MultiIndex"
+                            % str(item))
+
+        new_levels = []
+        new_labels = []
+        for k, level, labels in zip(item, self.levels, self.labels):
+            if k not in level:
+                # have to insert into level
+                # must insert at end otherwise you have to recompute all the
+                # other labels
+                lev_loc = len(level)
+                level = level.insert(lev_loc, k)
+            else:
+                lev_loc = level.get_loc(k)
+
+            new_levels.append(level)
+            new_labels.append(np.insert(labels, loc, lev_loc))
+
+        return MultiIndex(levels=new_levels, labels=new_labels,
+                          names=self.names)
+
+    def delete(self, loc):
+        """
+        Make new index with passed location deleted
+
+        Returns
+        -------
+        new_index : MultiIndex
+        """
+        new_labels = [np.delete(lab, loc) for lab in self.labels]
+        return MultiIndex(levels=self.levels, labels=new_labels,
+                          names=self.names)
+
+    get_major_bounds = slice_locs
+
+    __bounds = None
+
+    @property
+    def _bounds(self):
+        """
+        Return (computing and caching on first access) the slice points for
+        level 0, assuming sortedness
+        """
+        if self.__bounds is None:
+            inds = np.arange(len(self.levels[0]))
+            self.__bounds = self.labels[0].searchsorted(inds)
+
+        return self.__bounds
+
+    def _wrap_joined_index(self, joined, other):
+        names = self.names if self.names == other.names else None
+        return MultiIndex.from_tuples(joined, names=names)
+
+
+# For utility purposes
+
+
+def _sparsify(label_list, start=0):
+    pivoted = zip(*label_list)
+    k = len(label_list)
+
+    result = pivoted[:start + 1]
+    prev = pivoted[start]
+
+    for cur in pivoted[start + 1:]:
+        sparse_cur = []
+
+        for i, (p, t) in enumerate(zip(prev, cur)):
+            if i == k - 1:
+                sparse_cur.append(t)
+                result.append(sparse_cur)
+                break
+
+            if p == t:
+                sparse_cur.append('')
+            else:
+                sparse_cur.append(t)
+
+        prev = cur
+
+    return zip(*result)
+
+
+def _ensure_index(index_like):
+    if isinstance(index_like, Index):
+        return index_like
+    if hasattr(index_like, 'name'):
+        return Index(index_like, name=index_like.name)
+
+    if isinstance(index_like, list):
+        if len(index_like) and isinstance(index_like[0], (list, np.ndarray)):
+            return MultiIndex.from_arrays(index_like)
+
+    return Index(index_like)
+
+def _validate_join_method(method):
+    if method not in ['left', 'right', 'inner', 'outer']:
+        raise Exception('do not recognize join method %s' % method)
+
+# TODO: handle index names!
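+# Rough sketch of the index-combining helpers below (hypothetical values):
+#
+#   >>> a, b = Index([1, 2, 3]), Index([2, 3, 4])
+#   >>> _get_combined_index([a, b])                  # union: 1, 2, 3, 4
+#   >>> _get_combined_index([a, b], intersect=True)  # intersection: 2, 3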
+def _get_combined_index(indexes, intersect=False): + indexes = _get_distinct_indexes(indexes) + if len(indexes) == 0: + return Index([]) + if len(indexes) == 1: + return indexes[0] + if intersect: + index = indexes[0] + for other in indexes[1:]: + index = index.intersection(other) + return index + union = _union_indexes(indexes) + return _ensure_index(union) + + +def _get_distinct_indexes(indexes): + return dict((id(x), x) for x in indexes).values() + + +def _union_indexes(indexes): + assert(len(indexes) > 0) + if len(indexes) == 1: + result = indexes[0] + if isinstance(result, list): + result = Index(sorted(result)) + return result + + indexes, kind = _sanitize_and_check(indexes) + + if kind == 'special': + result = indexes[0] + for other in indexes[1:]: + result = result.union(other) + return result + elif kind == 'array': + index = indexes[0] + for other in indexes[1:]: + if not index.equals(other): + return Index(lib.fast_unique_multiple(indexes)) + + return index + else: + return Index(lib.fast_unique_multiple_list(indexes)) + + +def _trim_front(strings): + """ + Trims zeros and decimal points + """ + trimmed = strings + while len(strings) > 0 and all([x[0] == ' ' for x in trimmed]): + trimmed = [x[1:] for x in trimmed] + return trimmed + + +def _sanitize_and_check(indexes): + kinds = list(set([type(index) for index in indexes])) + + if list in kinds: + if len(kinds) > 1: + indexes = [Index(com._try_sort(x)) + if not isinstance(x, Index) else x + for x in indexes] + kinds.remove(list) + else: + return indexes, 'list' + + if len(kinds) > 1 or Index not in kinds: + return indexes, 'special' + else: + return indexes, 'array' + +def _handle_legacy_indexes(indexes): + from pandas.core.daterange import DateRange + from pandas.tseries.index import DatetimeIndex + + converted = [] + for index in indexes: + if isinstance(index, DateRange): + if len(index) == 0: + kwds = dict(data=[], freq=index.offset, tz=index.tzinfo) + else: + kwds = dict(start=index[0], end=index[-1], + freq=index.offset, tz=index.tzinfo) + + index = DatetimeIndex(**kwds) + + converted.append(index) + + return converted + +def _get_consensus_names(indexes): + consensus_name = indexes[0].names + for index in indexes[1:]: + if index.names != consensus_name: + consensus_name = [None] * index.nlevels + break + return consensus_name + +def _ensure_compat_concat(indexes): + from pandas.tseries.index import DatetimeIndex + is_m8 = [isinstance(idx, DatetimeIndex) for idx in indexes] + if any(is_m8) and not all(is_m8): + return [_maybe_box_dtindex(idx) for idx in indexes] + return indexes + +def _maybe_box_dtindex(idx): + from pandas.tseries.index import DatetimeIndex + if isinstance(idx, DatetimeIndex): + return idx.asobject + return idx + +def _clean_arrays(values): + result = [] + for arr in values: + if np.issubdtype(arr.dtype, np.datetime64): + result.append(lib.map_infer(arr, lib.Timestamp)) + else: + result.append(arr) + return result + + +def _all_indexes_same(indexes): + first = indexes[0] + for index in indexes[1:]: + if not first.equals(index): + return False + return True diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py new file mode 100644 index 00000000..5e59ab5e --- /dev/null +++ b/pandas/core/indexing.py @@ -0,0 +1,614 @@ +# pylint: disable=W0223 + +from pandas.core.common import _asarray_tuplesafe +from pandas.core.index import Index, MultiIndex +import pandas.core.common as com + +import numpy as np + +# "null slice" +_NS = slice(None, None) + + +class IndexingError(Exception): + pass + + +class 
_NDFrameIndexer(object): + + def __init__(self, obj): + self.obj = obj + self.ndim = obj.ndim + + def __iter__(self): + raise NotImplementedError('ix is not iterable') + + def __getitem__(self, key): + if type(key) is tuple: + try: + return self.obj.get_value(*key) + except Exception: + pass + + return self._getitem_tuple(key) + else: + return self._getitem_axis(key, axis=0) + + def _get_label(self, label, axis=0): + try: + return self.obj.xs(label, axis=axis, copy=False) + except Exception: + return self.obj.xs(label, axis=axis, copy=True) + + def _get_loc(self, key, axis=0): + return self.obj._ixs(key, axis=axis) + + def _slice(self, obj, axis=0): + return self.obj._slice(obj, axis=axis) + + def __setitem__(self, key, value): + # kludgetastic + ax = self.obj._get_axis(0) + if isinstance(ax, MultiIndex): + try: + indexer = ax.get_loc(key) + self._setitem_with_indexer(indexer, value) + return + except Exception: + pass + + if isinstance(key, tuple): + if len(key) > self.ndim: + raise IndexingError('only tuples of length <= %d supported', + self.ndim) + indexer = self._convert_tuple(key) + else: + indexer = self._convert_to_indexer(key) + + self._setitem_with_indexer(indexer, value) + + def _convert_tuple(self, key): + keyidx = [] + for i, k in enumerate(key): + idx = self._convert_to_indexer(k, axis=i) + keyidx.append(idx) + return tuple(keyidx) + + def _setitem_with_indexer(self, indexer, value): + from pandas.core.frame import DataFrame + + # also has the side effect of consolidating in-place + + # mmm, spaghetti + + if self.obj._is_mixed_type: + if not isinstance(indexer, tuple): + indexer = self._tuplify(indexer) + + het_axis = self.obj._het_axis + het_idx = indexer[het_axis] + + if isinstance(het_idx, (int, long)): + het_idx = [het_idx] + + plane_indexer = indexer[:het_axis] + indexer[het_axis + 1:] + item_labels = self.obj._get_axis(het_axis) + + if isinstance(value, (np.ndarray, DataFrame)) and value.ndim > 1: + raise ValueError('Setting mixed-type DataFrames with ' + 'array/DataFrame pieces not yet supported') + + try: + for item in item_labels[het_idx]: + data = self.obj[item] + data.values[plane_indexer] = value + except ValueError: + for item, v in zip(item_labels[het_idx], value): + data = self.obj[item] + data.values[plane_indexer] = v + else: + if isinstance(indexer, tuple): + indexer = _maybe_convert_ix(*indexer) + + if isinstance(value, DataFrame): + value = value.values + if not isinstance(self.obj, DataFrame): + value = value.T + + self.obj.values[indexer] = value + + def _getitem_tuple(self, tup): + try: + return self._getitem_lowerdim(tup) + except IndexingError: + pass + + # ugly hack for GH #836 + if self._multi_take_opportunity(tup): + return self._multi_take(tup) + + # no shortcut needed + retval = self.obj + for i, key in enumerate(tup): + if i >= self.obj.ndim: + raise IndexingError('Too many indexers') + + if _is_null_slice(key): + continue + + retval = retval.ix._getitem_axis(key, axis=i) + + return retval + + def _multi_take_opportunity(self, tup): + from pandas.core.frame import DataFrame + + # ugly hack for GH #836 + if not isinstance(self.obj, DataFrame): + return False + + if not all(_is_list_like(x) for x in tup): + return False + + # just too complicated + if (isinstance(self.obj.index, MultiIndex) or + isinstance(self.obj.columns, MultiIndex)): + return False + + return True + + def _multi_take(self, tup): + index = self._convert_for_reindex(tup[0], axis=0) + columns = self._convert_for_reindex(tup[1], axis=1) + return self.obj.reindex(index=index, 
columns=columns) + + def _convert_for_reindex(self, key, axis=0): + labels = self.obj._get_axis(axis) + + if com._is_bool_indexer(key): + key = _check_bool_indexer(labels, key) + return labels[np.asarray(key)] + else: + if isinstance(key, Index): + # want Index objects to pass through untouched + keyarr = key + else: + # asarray can be unsafe, NumPy strings are weird + keyarr = _asarray_tuplesafe(key) + + if _is_integer_dtype(keyarr) and not _is_integer_index(labels): + return labels.take(keyarr) + + return keyarr + + def _getitem_lowerdim(self, tup): + from pandas.core.frame import DataFrame + + ax0 = self.obj._get_axis(0) + # a bit kludgy + if isinstance(ax0, MultiIndex): + try: + return self._get_label(tup, axis=0) + except TypeError: + # slices are unhashable + pass + except Exception, e1: + if isinstance(tup[0], slice): + raise IndexingError + try: + loc = ax0.get_loc(tup[0]) + except KeyError: + raise e1 + + # to avoid wasted computation + # df.ix[d1:d2, 0] -> columns first (True) + # df.ix[0, ['C', 'B', A']] -> rows first (False) + for i, key in enumerate(tup): + if _is_label_like(key) or isinstance(key, tuple): + section = self._getitem_axis(key, axis=i) + + # might have been a MultiIndex + if section.ndim == self.ndim: + new_key = tup[:i] + (_NS,) + tup[i + 1:] + # new_key = tup[:i] + tup[i+1:] + else: + new_key = tup[:i] + tup[i + 1:] + + # unfortunately need an odious kludge here because of + # DataFrame transposing convention + if (isinstance(section, DataFrame) and i > 0 + and len(new_key) == 2): + a, b = new_key + new_key = b, a + + if len(new_key) == 1: + new_key, = new_key + + return section.ix[new_key] + + raise IndexingError('not applicable') + + def _getitem_axis(self, key, axis=0): + labels = self.obj._get_axis(axis) + if isinstance(key, slice): + return self._get_slice_axis(key, axis=axis) + elif _is_list_like(key) and not (isinstance(key, tuple) and + isinstance(labels, MultiIndex)): + + if hasattr(key, 'ndim') and key.ndim > 1: + raise ValueError('Cannot index with multidimensional key') + + return self._getitem_iterable(key, axis=axis) + elif axis == 0: + is_int_index = _is_integer_index(labels) + + idx = key + if com.is_integer(key): + if isinstance(labels, MultiIndex): + try: + return self._get_label(key, axis=0) + except (KeyError, TypeError): + if _is_integer_index(self.obj.index.levels[0]): + raise + + if not is_int_index: + return self._get_loc(key, axis=0) + + return self._get_label(idx, axis=0) + else: + labels = self.obj._get_axis(axis) + lab = key + if com.is_integer(key) and not _is_integer_index(labels): + return self._get_loc(key, axis=axis) + return self._get_label(lab, axis=axis) + + def _getitem_iterable(self, key, axis=0): + labels = self.obj._get_axis(axis) + + def _reindex(keys, level=None): + try: + return self.obj.reindex_axis(keys, axis=axis, level=level) + except AttributeError: + # Series + assert(axis == 0) + return self.obj.reindex(keys, level=level) + + if com._is_bool_indexer(key): + key = _check_bool_indexer(labels, key) + return _reindex(labels[np.asarray(key)]) + else: + if isinstance(key, Index): + # want Index objects to pass through untouched + keyarr = key + else: + # asarray can be unsafe, NumPy strings are weird + keyarr = _asarray_tuplesafe(key) + + if _is_integer_dtype(keyarr) and not _is_integer_index(labels): + return self.obj.take(keyarr, axis=axis) + + # this is not the most robust, but... 
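+            # a list of scalar (non-tuple) keys against a MultiIndex axis is
+            # interpreted as labels of the first level, hence level=0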
+ if (isinstance(labels, MultiIndex) and + not isinstance(keyarr[0], tuple)): + level = 0 + else: + level = None + + return _reindex(keyarr, level=level) + + def _convert_to_indexer(self, obj, axis=0): + """ + Convert indexing key into something we can use to do actual fancy + indexing on an ndarray + + Examples + ix[:5] -> slice(0, 5) + ix[[1,2,3]] -> [1,2,3] + ix[['foo', 'bar', 'baz']] -> [i, j, k] (indices of foo, bar, baz) + + Going by Zen of Python? + "In the face of ambiguity, refuse the temptation to guess." + raise AmbiguousIndexError with integer labels? + - No, prefer label-based indexing + """ + labels = self.obj._get_axis(axis) + is_int_index = _is_integer_index(labels) + + if com.is_integer(obj) and not is_int_index: + return obj + + try: + return labels.get_loc(obj) + except (KeyError, TypeError): + pass + + if isinstance(obj, slice): + ltype = labels.inferred_type + + if ltype == 'floating': + int_slice = _is_int_slice(obj) + else: + # floats that are within tolerance of int used + int_slice = _is_index_slice(obj) + + null_slice = obj.start is None and obj.stop is None + # could have integers in the first level of the MultiIndex + position_slice = (int_slice + and not ltype == 'integer' + and not isinstance(labels, MultiIndex)) + + start, stop = obj.start, obj.stop + + # last ditch effort: if we are mixed and have integers + try: + if 'mixed' in ltype and int_slice: + if start is not None: + i = labels.get_loc(start) + if stop is not None: + j = labels.get_loc(stop) + position_slice = False + except KeyError: + if ltype == 'mixed-integer': + raise + + if null_slice or position_slice: + slicer = obj + else: + try: + i, j = labels.slice_locs(start, stop) + if isinstance(i, slice): + i = i.start + if isinstance(j, slice): + j = j.stop + slicer = slice(i, j, obj.step) + except Exception: + if _is_index_slice(obj): + if labels.inferred_type == 'integer': + raise + slicer = obj + else: + raise + + return slicer + + elif _is_list_like(obj): + if com._is_bool_indexer(obj): + objarr = _check_bool_indexer(labels, obj) + return objarr + else: + if isinstance(obj, Index): + objarr = obj.values + else: + objarr = _asarray_tuplesafe(obj) + + # If have integer labels, defer to label-based indexing + if _is_integer_dtype(objarr) and not is_int_index: + return objarr + + # this is not the most robust, but... 
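+                # scalar (non-tuple) keys against a MultiIndex are matched on
+                # level 0; anything not found raises KeyError further down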
+ if (isinstance(labels, MultiIndex) and + not isinstance(objarr[0], tuple)): + level = 0 + _, indexer = labels.reindex(objarr, level=level) + + check = labels.levels[0].get_indexer(objarr) + else: + level = None + indexer = check = labels.get_indexer(objarr) + + mask = check == -1 + if mask.any(): + raise KeyError('%s not in index' % objarr[mask]) + + return indexer + else: + return labels.get_loc(obj) + + def _tuplify(self, loc): + tup = [slice(None, None) for _ in range(self.ndim)] + tup[0] = loc + return tuple(tup) + + def _get_slice_axis(self, slice_obj, axis=0): + obj = self.obj + + axis_name = obj._get_axis_name(axis) + labels = getattr(obj, axis_name) + + int_slice = _is_index_slice(slice_obj) + + start = slice_obj.start + stop = slice_obj.stop + + # in case of providing all floats, use label-based indexing + float_slice = (labels.inferred_type == 'floating' + and _is_float_slice(slice_obj)) + + null_slice = slice_obj.start is None and slice_obj.stop is None + + # could have integers in the first level of the MultiIndex, in which + # case we wouldn't want to do position-based slicing + position_slice = (int_slice + and labels.inferred_type != 'integer' + and not isinstance(labels, MultiIndex) + and not float_slice) + + # last ditch effort: if we are mixed and have integers + try: + if 'mixed' in labels.inferred_type and int_slice: + if start is not None: + i = labels.get_loc(start) + if stop is not None: + j = labels.get_loc(stop) + position_slice = False + except KeyError: + if labels.inferred_type == 'mixed-integer': + raise + + if null_slice or position_slice: + slicer = slice_obj + else: + try: + i, j = labels.slice_locs(start, stop) + slicer = slice(i, j, slice_obj.step) + except Exception: + if _is_index_slice(slice_obj): + if labels.inferred_type == 'integer': + raise + slicer = slice_obj + else: + raise + + if not _need_slice(slice_obj): + return obj + + return self._slice(slicer, axis=axis) + +# 32-bit floating point machine epsilon +_eps = np.finfo('f4').eps + +def _is_index_slice(obj): + def _is_valid_index(x): + return (com.is_integer(x) or com.is_float(x) + and np.allclose(x, int(x), rtol=_eps, atol=0)) + + def _crit(v): + return v is None or _is_valid_index(v) + + both_none = obj.start is None and obj.stop is None + + return not both_none and (_crit(obj.start) and _crit(obj.stop)) + +def _is_int_slice(obj): + def _is_valid_index(x): + return com.is_integer(x) + + def _crit(v): + return v is None or _is_valid_index(v) + + both_none = obj.start is None and obj.stop is None + + return not both_none and (_crit(obj.start) and _crit(obj.stop)) + +def _is_float_slice(obj): + def _is_valid_index(x): + return com.is_float(x) + + def _crit(v): + return v is None or _is_valid_index(v) + + both_none = obj.start is None and obj.stop is None + + return not both_none and (_crit(obj.start) and _crit(obj.stop)) + + +class _SeriesIndexer(_NDFrameIndexer): + """ + Class to support fancy indexing, potentially using labels + + Notes + ----- + Indexing based on labels is INCLUSIVE + Slicing uses PYTHON SEMANTICS (endpoint is excluded) + + If Index contains int labels, these will be used rather than the locations, + so be very careful (ambiguous). 
+ + Examples + -------- + >>> ts.ix[5:10] # equivalent to ts[5:10] + >>> ts.ix[[date1, date2, date3]] + >>> ts.ix[date1:date2] = 0 + """ + + def _get_label(self, key, axis=0): + return self.obj[key] + + def _get_loc(self, key, axis=0): + return self.obj.values[key] + + def _slice(self, indexer, axis=0): + return self.obj._get_values(indexer) + + def _setitem_with_indexer(self, indexer, value): + self.obj._set_values(indexer, value) + + +def _check_bool_indexer(ax, key): + # boolean indexing, need to check that the data are aligned, otherwise + # disallowed + result = key + if _is_series(key) and key.dtype == np.bool_: + if not key.index.equals(ax): + result = key.reindex(ax) + + if isinstance(result, np.ndarray) and result.dtype == np.object_: + mask = com.isnull(result) + if mask.any(): + raise IndexingError('cannot index with vector containing ' + 'NA / NaN values') + + return result + + +def _is_series(obj): + from pandas.core.series import Series + return isinstance(obj, Series) + + +def _maybe_convert_ix(*args): + """ + We likely want to take the cross-product + """ + ixify = True + for arg in args: + if not isinstance(arg, (np.ndarray, list)): + ixify = False + + if ixify: + return np.ix_(*args) + else: + return args + + +def _is_null_slice(obj): + return (isinstance(obj, slice) and obj.start is None and + obj.stop is None and obj.step is None) + + +def _is_integer_dtype(arr): + return (issubclass(arr.dtype.type, np.integer) and + not arr.dtype.type == np.datetime64) + + +def _is_integer_index(index): + return index.inferred_type == 'integer' + + +def _is_label_like(key): + # select a label or row + return not isinstance(key, slice) and not _is_list_like(key) + + +def _is_list_like(obj): + # Consider namedtuples to be not list like as they are useful as indices + return (np.iterable(obj) + and not isinstance(obj, basestring) + and not (isinstance(obj, tuple) and type(obj) is not tuple)) + + +def _need_slice(obj): + return (obj.start is not None or + obj.stop is not None or + (obj.step is not None and obj.step != 1)) + + +def _maybe_droplevels(index, key): + # drop levels + if isinstance(key, tuple): + for _ in key: + index = index.droplevel(0) + else: + index = index.droplevel(0) + + return index diff --git a/pandas/core/internals.py b/pandas/core/internals.py new file mode 100644 index 00000000..1966b51f --- /dev/null +++ b/pandas/core/internals.py @@ -0,0 +1,1432 @@ +import itertools +from datetime import datetime + +from numpy import nan +import numpy as np + +from pandas.core.index import Index, _ensure_index, _handle_legacy_indexes +import pandas.core.common as com +import pandas.lib as lib + +class Block(object): + """ + Canonical n-dimensional unit of homogeneous dtype contained in a pandas data + structure + + Index-ignorant; let the container take care of that + """ + __slots__ = ['items', 'ref_items', '_ref_locs', 'values', 'ndim'] + + def __init__(self, values, items, ref_items, ndim=2, + do_integrity_check=False): + if issubclass(values.dtype.type, basestring): + values = np.array(values, dtype=object) + + assert(values.ndim == ndim) + assert(len(items) == len(values)) + + self.values = values + self.ndim = ndim + self.items = _ensure_index(items) + self.ref_items = _ensure_index(ref_items) + + if do_integrity_check: + self._check_integrity() + + def _check_integrity(self): + if len(self.items) < 2: + return + # monotonicity + return (self.ref_locs[1:] > self.ref_locs[:-1]).all() + + _ref_locs = None + @property + def ref_locs(self): + if self._ref_locs is None: + indexer 
= self.ref_items.get_indexer(self.items) + indexer = com._ensure_platform_int(indexer) + assert((indexer != -1).all()) + self._ref_locs = indexer + return self._ref_locs + + def set_ref_items(self, ref_items, maybe_rename=True): + """ + If maybe_rename=True, need to set the items for this guy + """ + assert(isinstance(ref_items, Index)) + if maybe_rename: + self.items = ref_items.take(self.ref_locs) + self.ref_items = ref_items + + def __repr__(self): + shape = ' x '.join([str(s) for s in self.shape]) + name = type(self).__name__ + return '%s: %s, %s, dtype %s' % (name, self.items, shape, self.dtype) + + def __contains__(self, item): + return item in self.items + + def __len__(self): + return len(self.values) + + def __getstate__(self): + # should not pickle generally (want to share ref_items), but here for + # completeness + return (self.items, self.ref_items, self.values) + + def __setstate__(self, state): + items, ref_items, values = state + self.items = _ensure_index(items) + self.ref_items = _ensure_index(ref_items) + self.values = values + self.ndim = values.ndim + + @property + def shape(self): + return self.values.shape + + @property + def dtype(self): + return self.values.dtype + + def copy(self, deep=True): + values = self.values + if deep: + values = values.copy() + return make_block(values, self.items, self.ref_items) + + def merge(self, other): + assert(self.ref_items.equals(other.ref_items)) + + # Not sure whether to allow this or not + # if not union_ref.equals(other.ref_items): + # union_ref = self.ref_items + other.ref_items + return _merge_blocks([self, other], self.ref_items) + + def reindex_axis(self, indexer, mask, needs_masking, axis=0, + fill_value=np.nan): + """ + Reindex using pre-computed indexer information + """ + if self.values.size > 0: + new_values = com.take_fast(self.values, indexer, mask, + needs_masking, axis=axis, + fill_value=fill_value) + else: + shape = list(self.shape) + shape[axis] = len(indexer) + new_values = np.empty(shape) + new_values.fill(fill_value) + return make_block(new_values, self.items, self.ref_items) + + def reindex_items_from(self, new_ref_items, copy=True): + """ + Reindex to only those items contained in the input set of items + + E.g. 
if you have ['a', 'b'], and the input items is ['b', 'c', 'd'], + then the resulting items will be ['b'] + + Returns + ------- + reindexed : Block + """ + new_ref_items, indexer = self.items.reindex(new_ref_items) + if indexer is None: + new_items = new_ref_items + new_values = self.values.copy() if copy else self.values + else: + mask = indexer != -1 + masked_idx = indexer[mask] + + if self.values.ndim == 2: + new_values = com.take_2d(self.values, masked_idx, axis=0, + needs_masking=False) + else: + new_values = self.values.take(masked_idx, axis=0) + + new_items = self.items.take(masked_idx) + return make_block(new_values, new_items, new_ref_items) + + def get(self, item): + loc = self.items.get_loc(item) + return self.values[loc] + + def set(self, item, value): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + loc = self.items.get_loc(item) + self.values[loc] = value + + def delete(self, item): + """ + Returns + ------- + y : Block (new object) + """ + loc = self.items.get_loc(item) + new_items = self.items.delete(loc) + new_values = np.delete(self.values, loc, 0) + return make_block(new_values, new_items, self.ref_items) + + def split_block_at(self, item): + """ + Split block around given column, for "deleting" a column without + having to copy data by returning views on the original array + + Returns + ------- + leftb, rightb : (Block or None, Block or None) + """ + loc = self.items.get_loc(item) + + if len(self.items) == 1: + # no blocks left + return None, None + + if loc == 0: + # at front + left_block = None + right_block = make_block(self.values[1:], self.items[1:].copy(), + self.ref_items) + elif loc == len(self.values) - 1: + # at back + left_block = make_block(self.values[:-1], self.items[:-1].copy(), + self.ref_items) + right_block = None + else: + # in the middle + left_block = make_block(self.values[:loc], + self.items[:loc].copy(), self.ref_items) + right_block = make_block(self.values[loc + 1:], + self.items[loc + 1:].copy(), + self.ref_items) + + return left_block, right_block + + def fillna(self, value, inplace=False): + new_values = self.values if inplace else self.values.copy() + + mask = com.isnull(new_values) + np.putmask(new_values, mask, value) + + if inplace: + return self + else: + return make_block(new_values, self.items, self.ref_items) + + def _can_hold_element(self, value): + raise NotImplementedError() + + def _try_cast(self, value): + raise NotImplementedError() + + def replace(self, to_replace, value, inplace=False): + new_values = self.values if inplace else self.values.copy() + if self._can_hold_element(value): + value = self._try_cast(value) + + if not isinstance(to_replace, (list, np.ndarray)): + if self._can_hold_element(to_replace): + to_replace = self._try_cast(to_replace) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) + else: + try: + to_replace = np.array(to_replace, dtype=self.dtype) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) + except: + to_replace = np.array(to_replace, dtype=object) + for r in to_replace: + if self._can_hold_element(r): + r = self._try_cast(r) + np.putmask(new_values, com.mask_missing(new_values, to_replace), + value) + + if inplace: + return self + else: + return make_block(new_values, self.items, self.ref_items) + + def putmask(self, mask, new, inplace=False): + new_values = self.values if inplace else self.values.copy() + if self._can_hold_element(new): + new = self._try_cast(new) + np.putmask(new_values, mask, new) + if 
inplace: + return self + else: + return make_block(new_values, self.items, self.ref_items) + + def interpolate(self, method='pad', axis=0, inplace=False, + limit=None, missing=None): + values = self.values if inplace else self.values.copy() + + if values.ndim != 2: + raise NotImplementedError + + transf = (lambda x: x) if axis == 0 else (lambda x: x.T) + + if missing is None: + mask = None + else: # todo create faster fill func without masking + mask = _mask_missing(transf(values), missing) + + if method == 'pad': + com.pad_2d(transf(values), limit=limit, mask=mask) + else: + com.backfill_2d(transf(values), limit=limit, mask=mask) + + return make_block(values, self.items, self.ref_items) + + def take(self, indexer, axis=1, fill_value=np.nan): + assert(axis >= 1) + new_values = com.take_fast(self.values, indexer, None, + None, axis=axis, + fill_value=fill_value) + return make_block(new_values, self.items, self.ref_items) + + def get_values(self, dtype): + return self.values + +def _mask_missing(array, missing_values): + if not isinstance(missing_values, (list, np.ndarray)): + missing_values = [missing_values] + + mask = None + missing_values = np.array(missing_values, dtype=object) + if com.isnull(missing_values).any(): + mask = com.isnull(array) + missing_values = missing_values[com.notnull(missing_values)] + + for v in missing_values: + if mask is None: + mask = array == missing_values + else: + mask |= array == missing_values + return mask + +#------------------------------------------------------------------------------- +# Is this even possible? + +class FloatBlock(Block): + _can_hold_na = True + + def _can_hold_element(self, element): + return isinstance(element, (float, int)) + + def _try_cast(self, element): + try: + return float(element) + except: # pragma: no cover + return element + + def should_store(self, value): + # when inserting a column should not coerce integers to floats + # unnecessarily + return issubclass(value.dtype.type, np.floating) + +class ComplexBlock(Block): + _can_hold_na = True + + def _can_hold_element(self, element): + return isinstance(element, complex) + + def _try_cast(self, element): + try: + return complex(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.complexfloating) + +class IntBlock(Block): + _can_hold_na = False + + def _can_hold_element(self, element): + return com.is_integer(element) + + def _try_cast(self, element): + try: + return int(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.integer) + +class BoolBlock(Block): + _can_hold_na = False + + def _can_hold_element(self, element): + return isinstance(element, (int, bool)) + + def _try_cast(self, element): + try: + return bool(element) + except: # pragma: no cover + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.bool_) + +class ObjectBlock(Block): + _can_hold_na = True + + def _can_hold_element(self, element): + return True + + def _try_cast(self, element): + return element + + def should_store(self, value): + return not issubclass(value.dtype.type, + (np.integer, np.floating, np.complexfloating, + np.datetime64, np.bool_)) + +_NS_DTYPE = np.dtype('M8[ns]') + +class DatetimeBlock(Block): + _can_hold_na = True + + def __init__(self, values, items, ref_items, ndim=2, + do_integrity_check=False): + if values.dtype != _NS_DTYPE: + values = lib.cast_to_nanoseconds(values) + + 
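+        # values have been normalized to datetime64[ns] above; the remaining
+        # setup is delegated to the base Block constructor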
Block.__init__(self, values, items, ref_items, ndim=ndim, + do_integrity_check=do_integrity_check) + + def _can_hold_element(self, element): + return com.is_integer(element) or isinstance(element, datetime) + + def _try_cast(self, element): + try: + return int(element) + except: + return element + + def should_store(self, value): + return issubclass(value.dtype.type, np.datetime64) + + def set(self, item, value): + """ + Modify Block in-place with new item value + + Returns + ------- + None + """ + loc = self.items.get_loc(item) + + if value.dtype != _NS_DTYPE: + value = lib.cast_to_nanoseconds(value) + + self.values[loc] = value + + def get_values(self, dtype): + if dtype == object: + flat_i8 = self.values.ravel().view(np.int64) + res = lib.ints_to_pydatetime(flat_i8) + return res.reshape(self.values.shape) + return self.values + + +def make_block(values, items, ref_items, do_integrity_check=False): + dtype = values.dtype + vtype = dtype.type + + if issubclass(vtype, np.floating): + klass = FloatBlock + elif issubclass(vtype, np.complexfloating): + klass = ComplexBlock + elif issubclass(vtype, np.datetime64): + klass = DatetimeBlock + elif issubclass(vtype, np.integer): + if vtype != np.int64: + values = values.astype('i8') + klass = IntBlock + elif dtype == np.bool_: + klass = BoolBlock + else: + klass = ObjectBlock + + return klass(values, items, ref_items, ndim=values.ndim, + do_integrity_check=do_integrity_check) + +# TODO: flexible with index=None and/or items=None + + +class BlockManager(object): + """ + Core internal data structure to implement DataFrame + + Manage a bunch of labeled 2D mixed-type ndarrays. Essentially it's a + lightweight blocked set of labeled data to be manipulated by the DataFrame + public API class + + Parameters + ---------- + + + Notes + ----- + This is *not* a public API class + """ + __slots__ = ['axes', 'blocks', 'ndim'] + + def __init__(self, blocks, axes, do_integrity_check=True): + self.axes = [_ensure_index(ax) for ax in axes] + self.blocks = blocks + + ndim = len(axes) + for block in blocks: + assert(ndim == block.values.ndim) + + if do_integrity_check: + self._verify_integrity() + + @classmethod + def make_empty(self): + return BlockManager([], [[], []]) + + def __nonzero__(self): + return True + + @property + def ndim(self): + return len(self.axes) + + def is_mixed_dtype(self): + counts = set() + for block in self.blocks: + counts.add(block.dtype) + if len(counts) > 1: + return True + return False + + def set_axis(self, axis, value): + cur_axis = self.axes[axis] + if len(value) != len(cur_axis): + raise Exception('Length mismatch (%d vs %d)' + % (len(value), len(cur_axis))) + self.axes[axis] = _ensure_index(value) + + if axis == 0: + for block in self.blocks: + block.set_ref_items(self.items, maybe_rename=True) + + # make items read only for now + def _get_items(self): + return self.axes[0] + items = property(fget=_get_items) + + def __getstate__(self): + block_values = [b.values for b in self.blocks] + block_items = [b.items for b in self.blocks] + axes_array = [ax for ax in self.axes] + return axes_array, block_values, block_items + + def __setstate__(self, state): + # discard anything after 3rd, support beta pickling format for a little + # while longer + ax_arrays, bvalues, bitems = state[:3] + + self.axes = [_ensure_index(ax) for ax in ax_arrays] + self.axes = _handle_legacy_indexes(self.axes) + + blocks = [] + for values, items in zip(bvalues, bitems): + blk = make_block(values, items, self.axes[0], + do_integrity_check=True) + 
blocks.append(blk) + self.blocks = blocks + + def __len__(self): + return len(self.items) + + def __repr__(self): + output = 'BlockManager' + for i, ax in enumerate(self.axes): + if i == 0: + output += '\nItems: %s' % ax + else: + output += '\nAxis %d: %s' % (i, ax) + + for block in self.blocks: + output += '\n%s' % repr(block) + return output + + @property + def shape(self): + return tuple(len(ax) for ax in self.axes) + + def _verify_integrity(self): + # _union_block_items(self.blocks) + mgr_shape = self.shape + for block in self.blocks: + assert(block.ref_items is self.items) + assert(block.values.shape[1:] == mgr_shape[1:]) + tot_items = sum(len(x.items) for x in self.blocks) + assert(len(self.items) == tot_items) + + def astype(self, dtype): + new_blocks = [] + for block in self.blocks: + newb = make_block(com._astype_nansafe(block.values, dtype), + block.items, block.ref_items) + new_blocks.append(newb) + + new_mgr = BlockManager(new_blocks, self.axes) + return new_mgr.consolidate() + + def is_consolidated(self): + """ + Return True if more than one block with the same dtype + """ + dtypes = [blk.dtype.type for blk in self.blocks] + return len(dtypes) == len(set(dtypes)) + + def get_numeric_data(self, copy=False, type_list=None): + """ + Parameters + ---------- + copy : boolean, default False + Whether to copy the blocks + type_list : tuple of type, default None + Numeric types by default (Float/Complex/Int but not Datetime) + """ + if type_list is None: + def filter_blocks(block): + return (isinstance(block, (IntBlock, FloatBlock, ComplexBlock)) + and not isinstance(block, DatetimeBlock)) + else: + type_list = self._get_clean_block_types(type_list) + filter_blocks = lambda block: isinstance(block, type_list) + + maybe_copy = lambda b: b.copy() if copy else b + num_blocks = [maybe_copy(b) for b in self.blocks if filter_blocks(b)] + + if len(num_blocks) == 0: + return BlockManager.make_empty() + + indexer = np.sort(np.concatenate([b.ref_locs for b in num_blocks])) + new_items = self.items.take(indexer) + + new_blocks = [] + for b in num_blocks: + b = b.copy(deep=False) + b.ref_items = new_items + new_blocks.append(b) + new_axes = list(self.axes) + new_axes[0] = new_items + return BlockManager(new_blocks, new_axes, do_integrity_check=False) + + def _get_clean_block_types(self, type_list): + if not isinstance(type_list, tuple): + try: + type_list = tuple(type_list) + except TypeError: + type_list = (type_list,) + + type_map = {int : IntBlock, float : FloatBlock, + complex : ComplexBlock, + np.datetime64 : DatetimeBlock, + datetime : DatetimeBlock, + bool : BoolBlock, + object : ObjectBlock} + + type_list = tuple([type_map.get(t, t) for t in type_list]) + return type_list + + def get_bool_data(self, copy=False): + return self.get_numeric_data(copy=copy, type_list=(BoolBlock,)) + + def get_slice(self, slobj, axis=0): + new_axes = list(self.axes) + new_axes[axis] = new_axes[axis][slobj] + + if axis == 0: + new_items = new_axes[0] + if len(self.blocks) == 1: + blk = self.blocks[0] + newb = make_block(blk.values[slobj], new_items, + new_items) + new_blocks = [newb] + else: + return self.reindex_items(new_items) + else: + new_blocks = self._slice_blocks(slobj, axis) + + return BlockManager(new_blocks, new_axes, do_integrity_check=False) + + def _slice_blocks(self, slobj, axis): + new_blocks = [] + + slicer = [slice(None, None) for _ in range(self.ndim)] + slicer[axis] = slobj + slicer = tuple(slicer) + + for block in self.blocks: + newb = make_block(block.values[slicer], block.items, + 
block.ref_items) + new_blocks.append(newb) + return new_blocks + + def get_series_dict(self): + # For DataFrame + return _blocks_to_series_dict(self.blocks, self.axes[1]) + + def __contains__(self, item): + return item in self.items + + @property + def nblocks(self): + return len(self.blocks) + + def copy(self, deep=True): + """ + Make deep or shallow copy of BlockManager + + Parameters + ---------- + deep : boolean, default True + If False, return shallow copy (do not copy data) + + Returns + ------- + copy : BlockManager + """ + copy_blocks = [block.copy(deep=deep) for block in self.blocks] + # copy_axes = [ax.copy() for ax in self.axes] + copy_axes = list(self.axes) + return BlockManager(copy_blocks, copy_axes, do_integrity_check=False) + + def as_matrix(self, items=None): + if len(self.blocks) == 0: + mat = np.empty(self.shape, dtype=float) + elif len(self.blocks) == 1: + blk = self.blocks[0] + if items is None or blk.items.equals(items): + # if not, then just call interleave per below + mat = blk.values + else: + mat = self.reindex_items(items).as_matrix() + else: + if items is None: + mat = self._interleave(self.items) + else: + mat = self.reindex_items(items).as_matrix() + + return mat + + def _interleave(self, items): + """ + Return ndarray from blocks with specified item order + Items must be contained in the blocks + """ + dtype = _interleaved_dtype(self.blocks) + items = _ensure_index(items) + + result = np.empty(self.shape, dtype=dtype) + itemmask = np.zeros(len(items), dtype=bool) + + # By construction, all of the item should be covered by one of the + # blocks + for block in self.blocks: + indexer = items.get_indexer(block.items) + assert((indexer != -1).all()) + result[indexer] = block.get_values(dtype) + itemmask[indexer] = 1 + assert(itemmask.all()) + return result + + def xs(self, key, axis=1, copy=True): + assert(axis >= 1) + + loc = self.axes[axis].get_loc(key) + slicer = [slice(None, None) for _ in range(self.ndim)] + slicer[axis] = loc + slicer = tuple(slicer) + + new_axes = list(self.axes) + + # could be an array indexer! 
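+        # get_loc can return a slice or an array (e.g. for duplicate keys);
+        # in that case keep and slice the axis rather than dropping it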
+ if isinstance(loc, (slice, np.ndarray)): + new_axes[axis] = new_axes[axis][loc] + else: + new_axes.pop(axis) + + new_blocks = [] + if len(self.blocks) > 1: + if not copy: + raise Exception('cannot get view of mixed-type or ' + 'non-consolidated DataFrame') + for blk in self.blocks: + newb = make_block(blk.values[slicer], blk.items, blk.ref_items) + new_blocks.append(newb) + elif len(self.blocks) == 1: + vals = self.blocks[0].values[slicer] + if copy: + vals = vals.copy() + new_blocks = [make_block(vals, self.items, self.items)] + + return BlockManager(new_blocks, new_axes) + + def fast_2d_xs(self, loc, copy=False): + """ + + """ + if len(self.blocks) == 1: + result = self.blocks[0].values[:, loc] + if copy: + result = result.copy() + return result + + if not copy: + raise Exception('cannot get view of mixed-type or ' + 'non-consolidated DataFrame') + + dtype = _interleaved_dtype(self.blocks) + + items = self.items + n = len(items) + result = np.empty(n, dtype=dtype) + for blk in self.blocks: + values = blk.values + for j, item in enumerate(blk.items): + i = items.get_loc(item) + result[i] = values[j, loc] + + return result + + def consolidate(self): + """ + Join together blocks having same dtype + + Returns + ------- + y : BlockManager + """ + if self.is_consolidated(): + return self + + new_blocks = _consolidate(self.blocks, self.items) + return BlockManager(new_blocks, self.axes) + + def _consolidate_inplace(self): + self.blocks = _consolidate(self.blocks, self.items) + + def get(self, item): + _, block = self._find_block(item) + return block.get(item) + + def iget(self, i): + item = self.items[i] + if self.items.is_unique: + return self.get(item) + else: + # ugh + inds, = (self.items == item).nonzero() + + _, block = self._find_block(item) + + binds, = (block.items == item).nonzero() + + for j, (k, b) in enumerate(zip(inds, binds)): + if i == k: + return block.values[b] + + raise Exception('Cannot have duplicate column names ' + 'split across dtypes') + + def get_scalar(self, tup): + """ + Retrieve single item + """ + item = tup[0] + _, blk = self._find_block(item) + + # this could obviously be seriously sped up in cython + item_loc = blk.items.get_loc(item), + full_loc = item_loc + tuple(ax.get_loc(x) + for ax, x in zip(self.axes[1:], tup[1:])) + return blk.values[full_loc] + + def delete(self, item): + i, _ = self._find_block(item) + loc = self.items.get_loc(item) + + new_items = self.items.delete(loc) + + self._delete_from_block(i, item) + self.set_items_norename(new_items) + + def set(self, item, value): + """ + Set new item in-place. Does not consolidate. 
Adds new Block if not + contained in the current set of items + """ + if value.ndim == self.ndim - 1: + value = value.reshape((1,) + value.shape) + assert(value.shape[1:] == self.shape[1:]) + if item in self.items: + i, block = self._find_block(item) + if not block.should_store(value): + # delete from block, create and append new block + self._delete_from_block(i, item) + self._add_new_block(item, value, loc=None) + else: + block.set(item, value) + else: + # insert at end + self.insert(len(self.items), item, value) + + def insert(self, loc, item, value): + if item in self.items: + raise Exception('cannot insert %s, already exists' % item) + + new_items = self.items.insert(loc, item) + self.set_items_norename(new_items) + + # new block + self._add_new_block(item, value, loc=loc) + + if len(self.blocks) > 100: + self._consolidate_inplace() + + def set_items_norename(self, value): + value = _ensure_index(value) + self.axes[0] = value + + for block in self.blocks: + block.set_ref_items(value, maybe_rename=False) + + def _delete_from_block(self, i, item): + """ + Delete and maybe remove the whole block + """ + block = self.blocks.pop(i) + new_left, new_right = block.split_block_at(item) + + if new_left is not None: + self.blocks.append(new_left) + + if new_right is not None: + self.blocks.append(new_right) + + def _add_new_block(self, item, value, loc=None): + # Do we care about dtype at the moment? + + # hm, elaborate hack? + if loc is None: + loc = self.items.get_loc(item) + new_block = make_block(value, self.items[loc:loc+1].copy(), + self.items) + self.blocks.append(new_block) + + def _find_block(self, item): + self._check_have(item) + for i, block in enumerate(self.blocks): + if item in block: + return i, block + + def _check_have(self, item): + if item not in self.items: + raise KeyError('no item named %s' % str(item)) + + def reindex_axis(self, new_axis, method=None, axis=0, copy=True): + new_axis = _ensure_index(new_axis) + cur_axis = self.axes[axis] + + if new_axis.equals(cur_axis): + if copy: + result = self.copy(deep=True) + result.axes[axis] = new_axis + return result + else: + return self + + if axis == 0: + assert(method is None) + return self.reindex_items(new_axis) + + new_axis, indexer = cur_axis.reindex(new_axis, method) + return self.reindex_indexer(new_axis, indexer, axis=axis) + + def reindex_indexer(self, new_axis, indexer, axis=1, fill_value=np.nan): + """ + pandas-indexer with -1's only. + """ + if axis == 0: + return self._reindex_indexer_items(new_axis, indexer, fill_value) + + mask = indexer == -1 + + # TODO: deal with length-0 case? or does it fall out? 
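+        # masking (filling with fill_value) is only needed when the new axis
+        # is non-empty and some labels were not found in the old one (-1s)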
+ needs_masking = len(new_axis) > 0 and mask.any() + + new_blocks = [] + for block in self.blocks: + newb = block.reindex_axis(indexer, mask, needs_masking, + axis=axis, fill_value=fill_value) + new_blocks.append(newb) + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return BlockManager(new_blocks, new_axes) + + def _reindex_indexer_items(self, new_items, indexer, fill_value): + # TODO: less efficient than I'd like + + item_order = com.take_1d(self.items.values, indexer) + + # keep track of what items aren't found anywhere + mask = np.zeros(len(item_order), dtype=bool) + + new_blocks = [] + for blk in self.blocks: + blk_indexer = blk.items.get_indexer(item_order) + selector = blk_indexer != -1 + # update with observed items + mask |= selector + + if not selector.any(): + continue + + new_block_items = new_items.take(selector.nonzero()[0]) + new_values = com.take_fast(blk.values, blk_indexer[selector], + None, False, axis=0) + new_blocks.append(make_block(new_values, new_block_items, + new_items)) + + if not mask.all(): + na_items = new_items[-mask] + na_block = self._make_na_block(na_items, new_items, + fill_value=fill_value) + new_blocks.append(na_block) + new_blocks = _consolidate(new_blocks, new_items) + + return BlockManager(new_blocks, [new_items] + self.axes[1:]) + + def reindex_items(self, new_items, copy=True, fill_value=np.nan): + """ + + """ + new_items = _ensure_index(new_items) + data = self + if not data.is_consolidated(): + data = data.consolidate() + return data.reindex_items(new_items) + + # TODO: this part could be faster (!) + new_items, indexer = self.items.reindex(new_items) + + # could have some pathological (MultiIndex) issues here + new_blocks = [] + if indexer is None: + for blk in self.blocks: + if copy: + new_blocks.append(blk.reindex_items_from(new_items)) + else: + blk.ref_items = new_items + new_blocks.append(blk) + else: + for block in self.blocks: + newb = block.reindex_items_from(new_items, copy=copy) + if len(newb.items) > 0: + new_blocks.append(newb) + + mask = indexer == -1 + if mask.any(): + extra_items = new_items[mask] + na_block = self._make_na_block(extra_items, new_items, + fill_value=fill_value) + new_blocks.append(na_block) + new_blocks = _consolidate(new_blocks, new_items) + + return BlockManager(new_blocks, [new_items] + self.axes[1:]) + + def _make_na_block(self, items, ref_items, fill_value=np.nan): + # TODO: infer dtypes other than float64 from fill_value + + block_shape = list(self.shape) + block_shape[0] = len(items) + + dtype = com._infer_dtype(fill_value) + block_values = np.empty(block_shape, dtype=dtype) + block_values.fill(fill_value) + na_block = make_block(block_values, items, ref_items, + do_integrity_check=True) + return na_block + + def take(self, indexer, axis=1): + if axis == 0: + raise NotImplementedError + + indexer = np.asarray(indexer, dtype='i4') + + n = len(self.axes[axis]) + if ((indexer == -1) | (indexer >= n)).any(): + raise Exception('Indices must be nonzero and less than ' + 'the axis length') + + new_axes = list(self.axes) + new_axes[axis] = self.axes[axis].take(indexer) + new_blocks = [] + for blk in self.blocks: + new_values = com.take_fast(blk.values, indexer, + None, False, axis=axis) + newb = make_block(new_values, blk.items, self.items) + new_blocks.append(newb) + + return BlockManager(new_blocks, new_axes) + + def merge(self, other, lsuffix=None, rsuffix=None): + assert(self._is_indexed_like(other)) + + this, other = self._maybe_rename_join(other, lsuffix, rsuffix) + + cons_items = this.items + 
other.items + consolidated = _consolidate(this.blocks + other.blocks, cons_items) + + new_axes = list(this.axes) + new_axes[0] = cons_items + + return BlockManager(consolidated, new_axes) + + def _maybe_rename_join(self, other, lsuffix, rsuffix, copydata=True): + to_rename = self.items.intersection(other.items) + if len(to_rename) > 0: + if not lsuffix and not rsuffix: + raise Exception('columns overlap: %s' % to_rename) + + def lrenamer(x): + if x in to_rename: + return '%s%s' % (x, lsuffix) + return x + + def rrenamer(x): + if x in to_rename: + return '%s%s' % (x, rsuffix) + return x + + this = self.rename_items(lrenamer, copydata=copydata) + other = other.rename_items(rrenamer, copydata=copydata) + else: + this = self + + return this, other + + def _is_indexed_like(self, other): + """ + Check all axes except items + """ + assert(self.ndim == other.ndim) + for ax, oax in zip(self.axes[1:], other.axes[1:]): + if not ax.equals(oax): + return False + return True + + def rename_axis(self, mapper, axis=1): + new_axis = Index([mapper(x) for x in self.axes[axis]]) + assert(new_axis.is_unique) + + new_axes = list(self.axes) + new_axes[axis] = new_axis + return BlockManager(self.blocks, new_axes) + + def rename_items(self, mapper, copydata=True): + new_items = Index([mapper(x) for x in self.items]) + new_items.is_unique + + new_blocks = [] + for block in self.blocks: + newb = block.copy(deep=copydata) + newb.set_ref_items(new_items, maybe_rename=True) + new_blocks.append(newb) + new_axes = list(self.axes) + new_axes[0] = new_items + return BlockManager(new_blocks, new_axes) + + def add_prefix(self, prefix): + f = (('%s' % prefix) + '%s').__mod__ + return self.rename_items(f) + + def add_suffix(self, suffix): + f = ('%s' + ('%s' % suffix)).__mod__ + return self.rename_items(f) + + def fillna(self, value, inplace=False): + new_blocks = [b.fillna(value, inplace=inplace) + if b._can_hold_na else b + for b in self.blocks] + if inplace: + return self + return BlockManager(new_blocks, self.axes) + + def replace(self, to_replace, value, inplace=False): + new_blocks = [b.replace(to_replace, value, inplace=inplace) + for b in self.blocks] + if inplace: + return self + return BlockManager(new_blocks, self.axes) + + def _replace_list(self, src_lst, dest_lst): + sset = set(src_lst) + if any([k in sset for k in dest_lst]): + masks = {} + for s in src_lst: + masks[s] = [b.values == s for b in self.blocks] + + for s, d in zip(src_lst, dest_lst): + [b.putmask(masks[s][i], d, inplace=True) for i, b in + enumerate(self.blocks)] + else: + for s, d in zip(src_lst, dest_lst): + self.replace(s, d, inplace=True) + + return self + + @property + def block_id_vector(self): + # TODO + result = np.empty(len(self.items), dtype=int) + result.fill(-1) + + for i, blk in enumerate(self.blocks): + indexer = self.items.get_indexer(blk.items) + assert((indexer != -1).all()) + result.put(indexer, i) + + assert((result >= 0).all()) + return result + + @property + def item_dtypes(self): + result = np.empty(len(self.items), dtype='O') + mask = np.zeros(len(self.items), dtype=bool) + for i, blk in enumerate(self.blocks): + indexer = self.items.get_indexer(blk.items) + result.put(indexer, blk.values.dtype.name) + mask.put(indexer, 1) + assert(mask.all()) + return result + +def form_blocks(data, axes): + # pre-filter out items if we passed it + items = axes[0] + + if len(data) < len(items): + extra_items = items - Index(data.keys()) + else: + extra_items = [] + + # put "leftover" items in float bucket, where else? + # generalize? 
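+    # columns are grouped by dtype below and one homogeneous block is built
+    # per group; extra_items missing from data become a float block of NaN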
+ float_dict = {} + complex_dict = {} + int_dict = {} + bool_dict = {} + object_dict = {} + datetime_dict = {} + for k, v in data.iteritems(): + if issubclass(v.dtype.type, np.floating): + float_dict[k] = v + elif issubclass(v.dtype.type, np.complexfloating): + complex_dict[k] = v + elif issubclass(v.dtype.type, np.datetime64): + datetime_dict[k] = v + elif issubclass(v.dtype.type, np.integer): + int_dict[k] = v + elif v.dtype == np.bool_: + bool_dict[k] = v + else: + object_dict[k] = v + + blocks = [] + if len(float_dict): + float_block = _simple_blockify(float_dict, items, np.float64) + blocks.append(float_block) + + if len(complex_dict): + complex_block = _simple_blockify(complex_dict, items, np.complex128) + blocks.append(complex_block) + + if len(int_dict): + int_block = _simple_blockify(int_dict, items, np.int64) + blocks.append(int_block) + + if len(datetime_dict): + datetime_block = _simple_blockify(datetime_dict, items, + np.dtype('M8[ns]')) + blocks.append(datetime_block) + + if len(bool_dict): + bool_block = _simple_blockify(bool_dict, items, np.bool_) + blocks.append(bool_block) + + if len(object_dict) > 0: + object_block = _simple_blockify(object_dict, items, np.object_) + blocks.append(object_block) + + if len(extra_items): + shape = (len(extra_items),) + tuple(len(x) for x in axes[1:]) + block_values = np.empty(shape, dtype=float) + block_values.fill(nan) + + na_block = make_block(block_values, extra_items, items, + do_integrity_check=True) + blocks.append(na_block) + blocks = _consolidate(blocks, items) + + return blocks + +def _simple_blockify(dct, ref_items, dtype): + block_items, values = _stack_dict(dct, ref_items, dtype) + # CHECK DTYPE? + if values.dtype != dtype: # pragma: no cover + values = values.astype(dtype) + + return make_block(values, block_items, ref_items, do_integrity_check=True) + +def _stack_dict(dct, ref_items, dtype): + from pandas.core.series import Series + + # fml + def _asarray_compat(x): + # asarray shouldn't be called on SparseSeries + if isinstance(x, Series): + return x.values + else: + return np.asarray(x) + + def _shape_compat(x): + # sparseseries + if isinstance(x, Series): + return len(x), + else: + return x.shape + + # index may box values + items = ref_items[[x in dct for x in ref_items]] + + first = dct[items[0]] + shape = (len(dct),) + _shape_compat(first) + + stacked = np.empty(shape, dtype=dtype) + for i, item in enumerate(items): + stacked[i] = _asarray_compat(dct[item]) + + # stacked = np.vstack([_asarray_compat(dct[k]) for k in items]) + return items, stacked + +def _blocks_to_series_dict(blocks, index=None): + from pandas.core.series import Series + + series_dict = {} + + for block in blocks: + for item, vec in zip(block.items, block.values): + series_dict[item] = Series(vec, index=index, name=item) + return series_dict + +def _interleaved_dtype(blocks): + from collections import defaultdict + counts = defaultdict(lambda: 0) + for x in blocks: + counts[type(x)] += 1 + + have_int = counts[IntBlock] > 0 + have_bool = counts[BoolBlock] > 0 + have_object = counts[ObjectBlock] > 0 + have_float = counts[FloatBlock] > 0 + have_complex = counts[ComplexBlock] > 0 + have_dt64 = counts[DatetimeBlock] > 0 + have_numeric = have_float or have_complex or have_int + + if (have_object or + (have_bool and have_numeric) or + (have_numeric and have_dt64)): + return np.dtype(object) + elif have_bool: + return np.dtype(bool) + elif have_int and not have_float and not have_complex: + return np.dtype('i8') + elif have_dt64 and not have_float and not 
have_complex: + return np.dtype('M8[ns]') + elif have_complex: + return np.dtype('c16') + else: + return np.dtype('f8') + +def _consolidate(blocks, items): + """ + Merge blocks having same dtype + """ + get_dtype = lambda x: x.dtype.name + + # sort by dtype + grouper = itertools.groupby(sorted(blocks, key=get_dtype), + lambda x: x.dtype) + + new_blocks = [] + for dtype, group_blocks in grouper: + new_block = _merge_blocks(list(group_blocks), items) + new_blocks.append(new_block) + + return new_blocks + + +# TODO: this could be much optimized + +def _merge_blocks(blocks, items): + if len(blocks) == 1: + return blocks[0] + new_values = _vstack([b.values for b in blocks]) + new_items = blocks[0].items.append([b.items for b in blocks[1:]]) + new_block = make_block(new_values, new_items, items, + do_integrity_check=True) + return new_block.reindex_items_from(items) + +def _union_block_items(blocks): + tot_len = 0 + all_items = [] + slow = False + for b in blocks: + tot_len += len(b.items) + if type(b.items) != Index: + slow = True + all_items.append(b.items) + + if slow: + the_union = _union_items_slow(all_items) + else: + the_union = Index(lib.fast_unique_multiple(all_items)) + + if tot_len > len(the_union): + raise Exception('item names overlap') + return the_union + +def _union_items_slow(all_items): + seen = None + for items in all_items: + if seen is None: + seen = items + else: + seen = seen.union(items) + return seen + +def _vstack(to_stack): + if all(x.dtype == _NS_DTYPE for x in to_stack): + # work around NumPy 1.6 bug + new_values = np.vstack([x.view('i8') for x in to_stack]) + return new_values.view(_NS_DTYPE) + else: + return np.vstack(to_stack) diff --git a/pandas/core/matrix.py b/pandas/core/matrix.py new file mode 100644 index 00000000..3d42fd93 --- /dev/null +++ b/pandas/core/matrix.py @@ -0,0 +1 @@ +from pandas.core.frame import DataFrame as DataMatrix diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py new file mode 100644 index 00000000..545cf658 --- /dev/null +++ b/pandas/core/nanops.py @@ -0,0 +1,431 @@ +import sys + +import numpy as np + +from pandas.core.common import isnull, notnull +import pandas.core.common as com +import pandas.lib as lib + +try: + import bottleneck as bn + _USE_BOTTLENECK = True +except ImportError: # pragma: no cover + _USE_BOTTLENECK = False + +def _bottleneck_switch(bn_name, alt, zero_value=None, **kwargs): + try: + bn_func = getattr(bn, bn_name) + except (AttributeError, NameError): # pragma: no cover + bn_func = None + def f(values, axis=None, skipna=True, **kwds): + if len(kwargs) > 0: + for k, v in kwargs.iteritems(): + if k not in kwds: + kwds[k] = v + try: + if zero_value is not None and values.size == 0: + if values.ndim == 1: + return 0 + else: + result_shape = values.shape[:axis] + values.shape[axis + 1:] + result = np.empty(result_shape) + result.fill(0) + return result + + if _USE_BOTTLENECK and skipna and values.dtype != np.object_: + result = bn_func(values, axis=axis, **kwds) + # prefer to treat inf/-inf as NA + if _has_infs(result): + result = alt(values, axis=axis, skipna=skipna, **kwds) + else: + result = alt(values, axis=axis, skipna=skipna, **kwds) + except Exception: + result = alt(values, axis=axis, skipna=skipna, **kwds) + + return result + + return f + +def _has_infs(result): + if isinstance(result, np.ndarray): + if result.dtype == 'f8': + return lib.has_infs_f8(result) + elif result.dtype == 'f4': + return lib.has_infs_f4(result) + else: # pragma: no cover + raise TypeError('Only suppose float32/64 here') + else: + 
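+        # result is not an ndarray here (typically a scalar from a full
+        # reduction), so use NumPy's scalar inf checks rather than the typed
+        # array helpers above.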
return np.isinf(result) or np.isneginf(result) + +def nanany(values, axis=None, skipna=True): + mask = isnull(values) + + if skipna: + values = values.copy() + np.putmask(values, mask, False) + return values.any(axis) + +def nanall(values, axis=None, skipna=True): + mask = isnull(values) + + if skipna: + values = values.copy() + np.putmask(values, mask, True) + return values.all(axis) + +def _nansum(values, axis=None, skipna=True): + mask = isnull(values) + + if skipna and not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, 0) + + the_sum = values.sum(axis) + the_sum = _maybe_null_out(the_sum, axis, mask) + + return the_sum + +def _nanmean(values, axis=None, skipna=True): + mask = isnull(values) + + if skipna and not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, 0) + + the_sum = _ensure_numeric(values.sum(axis)) + count = _get_counts(mask, axis) + + if axis is not None: + the_mean = the_sum / count + ct_mask = count == 0 + if ct_mask.any(): + the_mean[ct_mask] = np.nan + else: + the_mean = the_sum / count if count > 0 else np.nan + return the_mean + +def _nanmedian(values, axis=None, skipna=True): + def get_median(x): + mask = notnull(x) + if not skipna and not mask.all(): + return np.nan + return lib.median(x[mask]) + + if values.dtype != np.float64: + values = values.astype('f8') + + if values.ndim > 1: + return np.apply_along_axis(get_median, axis, values) + else: + return get_median(values) + +def _nanvar(values, axis=None, skipna=True, ddof=1): + mask = isnull(values) + + if axis is not None: + count = (values.shape[axis] - mask.sum(axis)).astype(float) + else: + count = float(values.size - mask.sum()) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + X = _ensure_numeric(values.sum(axis)) + XX = _ensure_numeric((values ** 2).sum(axis)) + return np.fabs((XX - X ** 2 / count) / (count - ddof)) + +def _nanmin(values, axis=None, skipna=True): + mask = isnull(values) + if skipna and not issubclass(values.dtype.type, + (np.integer, np.datetime64)): + values = values.copy() + np.putmask(values, mask, np.inf) + # numpy 1.6.1 workaround in Python 3.x + if (values.dtype == np.object_ + and sys.version_info[0] >= 3): # pragma: no cover + import __builtin__ + if values.ndim > 1: + apply_ax = axis if axis is not None else 0 + result = np.apply_along_axis(__builtin__.min, apply_ax, values) + else: + result = __builtin__.min(values) + else: + result = values.min(axis) + + return _maybe_null_out(result, axis, mask) + +def _nanmax(values, axis=None, skipna=True): + mask = isnull(values) + if skipna and not issubclass(values.dtype.type, + (np.integer, np.datetime64)): + values = values.copy() + np.putmask(values, mask, -np.inf) + # numpy 1.6.1 workaround in Python 3.x + if (values.dtype == np.object_ + and sys.version_info[0] >= 3): # pragma: no cover + import __builtin__ + + if values.ndim > 1: + apply_ax = axis if axis is not None else 0 + result = np.apply_along_axis(__builtin__.max, apply_ax, values) + else: + result = __builtin__.max(values) + else: + result = values.max(axis) + return _maybe_null_out(result, axis, mask) + +def nanargmax(values, axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + mask = -np.isfinite(values) + if not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, -np.inf) + result = values.argmax(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + +def nanargmin(values, 
axis=None, skipna=True): + """ + Returns -1 in the NA case + """ + mask = -np.isfinite(values) + if not issubclass(values.dtype.type, np.integer): + values = values.copy() + np.putmask(values, mask, np.inf) + result = values.argmin(axis) + result = _maybe_arg_null_out(result, axis, mask, skipna) + return result + +nansum = _bottleneck_switch('nansum', _nansum, zero_value=0) +nanmean = _bottleneck_switch('nanmean', _nanmean) +nanmedian = _bottleneck_switch('nanmedian', _nanmedian) +nanvar = _bottleneck_switch('nanvar', _nanvar, ddof=1) +nanmin = _bottleneck_switch('nanmin', _nanmin) +nanmax = _bottleneck_switch('nanmax', _nanmax) + +def nanskew(values, axis=None, skipna=True): + if not isinstance(values.dtype.type, np.floating): + values = values.astype('f8') + + mask = isnull(values) + count = _get_counts(mask, axis) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + A = values.sum(axis) / count + B = (values ** 2).sum(axis) / count - A ** 2 + C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B + + # floating point error + B = _zero_out_fperr(B) + C = _zero_out_fperr(C) + + result = ((np.sqrt((count ** 2 - count)) * C) / + ((count - 2) * np.sqrt(B) ** 3)) + + if isinstance(result, np.ndarray): + result = np.where(B == 0, 0, result) + result[count < 3] = np.nan + return result + else: + result = 0 if B == 0 else result + if count < 3: + return np.nan + return result + +def nankurt(values, axis=None, skipna=True): + if not isinstance(values.dtype.type, np.floating): + values = values.astype('f8') + + mask = isnull(values) + count = _get_counts(mask, axis) + + if skipna: + values = values.copy() + np.putmask(values, mask, 0) + + A = values.sum(axis) / count + B = (values ** 2).sum(axis) / count - A ** 2 + C = (values ** 3).sum(axis) / count - A ** 3 - 3 * A * B + D = (values ** 4).sum(axis) / count - A ** 4 - 6 * B * A * A - 4 * C * A + + B = _zero_out_fperr(B) + C = _zero_out_fperr(C) + D = _zero_out_fperr(D) + + result = (((count*count - 1.)*D / (B*B) - 3*((count-1.)**2)) / + ((count - 2.)*(count-3.))) + if isinstance(result, np.ndarray): + result = np.where(B == 0, 0, result) + result[count < 4] = np.nan + return result + else: + result = 0 if B == 0 else result + if count < 4: + return np.nan + return result + +def nanprod(values, axis=None, skipna=True): + mask = isnull(values) + if skipna and not issubclass(values.dtype.type, np.integer): + values = values.copy() + values[mask] = 1 + result = values.prod(axis) + return _maybe_null_out(result, axis, mask) + +def _maybe_arg_null_out(result, axis, mask, skipna): + # helper function for nanargmin/nanargmax + if axis is None: + if skipna: + if mask.all(): + result = -1 + else: + if mask.any(): + result = -1 + else: + if skipna: + na_mask = mask.all(axis) + else: + na_mask = mask.any(axis) + if na_mask.any(): + result[na_mask] = -1 + return result + +def _get_counts(mask, axis): + if axis is not None: + count = (mask.shape[axis] - mask.sum(axis)).astype(float) + else: + count = float(mask.size - mask.sum()) + + return count + +def _maybe_null_out(result, axis, mask): + if axis is not None: + null_mask = (mask.shape[axis] - mask.sum(axis)) == 0 + if null_mask.any(): + result = result.astype('f8') + result[null_mask] = np.nan + else: + null_mask = mask.size - mask.sum() + if null_mask == 0: + result = np.nan + + return result + +def _zero_out_fperr(arg): + if isinstance(arg, np.ndarray): + return np.where(np.abs(arg) < 1e-14, 0, arg) + else: + return 0 if np.abs(arg) < 1e-14 else arg + +def nancorr(a, b, 
method='pearson'): + """ + a, b: ndarrays + """ + assert(len(a) == len(b)) + + valid = notnull(a) & notnull(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) == 0: + return np.nan + + f = get_corr_func(method) + return f(a, b) + +def get_corr_func(method): + if method in ['kendall', 'spearman']: + from scipy.stats import kendalltau, spearmanr + + def _pearson(a, b): + return np.corrcoef(a, b)[0, 1] + def _kendall(a, b): + return kendalltau(a, b)[0] + def _spearman(a, b): + return spearmanr(a, b)[0] + + _cor_methods = { + 'pearson' : _pearson, + 'kendall' : _kendall, + 'spearman' : _spearman + } + return _cor_methods[method] + +def nancov(a, b): + assert(len(a) == len(b)) + + valid = notnull(a) & notnull(b) + if not valid.all(): + a = a[valid] + b = b[valid] + + if len(a) == 0: + return np.nan + + return np.cov(a, b)[0, 1] + +def _ensure_numeric(x): + if isinstance(x, np.ndarray): + if x.dtype == np.object_: + x = x.astype(np.float64) + elif not (com.is_float(x) or com.is_integer(x)): + try: + x = float(x) + except Exception: + raise TypeError('Could not convert %s to numeric' % str(x)) + + return x + +# NA-friendly array comparisons + +import operator + +def make_nancomp(op): + def f(x, y): + xmask = isnull(x) + ymask = isnull(y) + mask = xmask | ymask + + result = op(x, y) + + if mask.any(): + if result.dtype == np.bool_: + result = result.astype('O') + np.putmask(result, mask, np.nan) + + return result + return f + +nangt = make_nancomp(operator.gt) +nange = make_nancomp(operator.ge) +nanlt = make_nancomp(operator.lt) +nanle = make_nancomp(operator.le) +naneq = make_nancomp(operator.eq) +nanne = make_nancomp(operator.ne) + +def unique1d(values): + """ + Hash table-based unique + """ + if np.issubdtype(values.dtype, np.floating): + table = lib.Float64HashTable(len(values)) + uniques = np.array(table.unique(com._ensure_float64(values)), + dtype=np.float64) + elif np.issubdtype(values.dtype, np.datetime64): + table = lib.Int64HashTable(len(values)) + uniques = table.unique(com._ensure_int64(values)) + uniques = uniques.view('M8[ns]') + elif np.issubdtype(values.dtype, np.integer): + table = lib.Int64HashTable(len(values)) + uniques = table.unique(com._ensure_int64(values)) + else: + table = lib.PyObjectHashTable(len(values)) + uniques = table.unique(com._ensure_object(values)) + return uniques diff --git a/pandas/core/panel.py b/pandas/core/panel.py new file mode 100644 index 00000000..ede50eb8 --- /dev/null +++ b/pandas/core/panel.py @@ -0,0 +1,1363 @@ +""" +Contains data structures designed for manipulating panel (3-dimensional) data +""" +# pylint: disable=E1103,W0231,W0212,W0621 + +import operator +import sys +import numpy as np + +from pandas.core.common import (PandasError, _mut_exclusive, + _try_sort, _default_index, _infer_dtype) +from pandas.core.categorical import Factor +from pandas.core.index import (Index, MultiIndex, _ensure_index, + _get_combined_index) +from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels +from pandas.core.internals import BlockManager, make_block, form_blocks +from pandas.core.frame import DataFrame +from pandas.core.generic import NDFrame +from pandas.util import py3compat +from pandas.util.decorators import deprecate, Appender, Substitution +import pandas.core.common as com +import pandas.core.nanops as nanops +import pandas.lib as lib + + +def _ensure_like_indices(time, panels): + """ + Makes sure that time and panels are conformable + """ + n_time = len(time) + n_panel = len(panels) + u_panels = np.unique(panels) # this 
sorts! + u_time = np.unique(time) + if len(u_time) == n_time: + time = np.tile(u_time, len(u_panels)) + if len(u_panels) == n_panel: + panels = np.repeat(u_panels, len(u_time)) + return time, panels + +def panel_index(time, panels, names=['time', 'panel']): + """ + Returns a multi-index suitable for a panel-like DataFrame + + Parameters + ---------- + time : array-like + Time index, does not have to repeat + panels : array-like + Panel index, does not have to repeat + names : list, optional + List containing the names of the indices + + Returns + ------- + multi_index : MultiIndex + Time index is the first level, the panels are the second level. + + Examples + -------- + >>> years = range(1960,1963) + >>> panels = ['A', 'B', 'C'] + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'), + (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'), + (1962, 'C')], dtype=object) + + or + + >>> import numpy as np + >>> years = np.repeat(range(1960,1963), 3) + >>> panels = np.tile(['A', 'B', 'C'], 3) + >>> panel_idx = panel_index(years, panels) + >>> panel_idx + MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'), + (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'), + (1962, 'C')], dtype=object) + """ + time, panels = _ensure_like_indices(time, panels) + time_factor = Factor.from_array(time) + panel_factor = Factor.from_array(panels) + + labels = [time_factor.labels, panel_factor.labels] + levels = [time_factor.levels, panel_factor.levels] + return MultiIndex(levels, labels, sortorder=None, names=names) + +class PanelError(Exception): + pass + +def _arith_method(func, name): + # work only for scalars + + def f(self, other): + if not np.isscalar(other): + raise ValueError('Simple arithmetic with Panel can only be ' + 'done with scalar values') + + return self._combine(other, func) + f.__name__ = name + return f + +def _panel_arith_method(op, name): + @Substitution(op) + def f(self, other, axis='items'): + """ + Wrapper method for %s + + Parameters + ---------- + other : DataFrame or Panel class + axis : {'items', 'major', 'minor'} + Axis to broadcast over + + Returns + ------- + Panel + """ + return self._combine(other, op, axis=axis) + + f.__name__ = name + return f + + +_agg_doc = """ +Return %(desc)s over requested axis + +Parameters +---------- +axis : {'items', 'major', 'minor'} or {0, 1, 2} +skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA + +Returns +------- +%(outname)s : DataFrame +""" + +_na_info = """ + +NA/null values are %s. 
+If all values are NA, result will be NA""" + + +class Panel(NDFrame): + _AXIS_NUMBERS = { + 'items' : 0, + 'major_axis' : 1, + 'minor_axis' : 2 + } + + _AXIS_ALIASES = { + 'major' : 'major_axis', + 'minor' : 'minor_axis' + } + + _AXIS_NAMES = { + 0 : 'items', + 1 : 'major_axis', + 2 : 'minor_axis' + } + + # major + _default_stat_axis = 1 + _het_axis = 0 + + items = lib.AxisProperty(0) + major_axis = lib.AxisProperty(1) + minor_axis = lib.AxisProperty(2) + + __add__ = _arith_method(operator.add, '__add__') + __sub__ = _arith_method(operator.sub, '__sub__') + __truediv__ = _arith_method(operator.truediv, '__truediv__') + __floordiv__ = _arith_method(operator.floordiv, '__floordiv__') + __mul__ = _arith_method(operator.mul, '__mul__') + __pow__ = _arith_method(operator.pow, '__pow__') + + __radd__ = _arith_method(operator.add, '__radd__') + __rmul__ = _arith_method(operator.mul, '__rmul__') + __rsub__ = _arith_method(lambda x, y: y - x, '__rsub__') + __rtruediv__ = _arith_method(lambda x, y: y / x, '__rtruediv__') + __rfloordiv__ = _arith_method(lambda x, y: y // x, '__rfloordiv__') + __rpow__ = _arith_method(lambda x, y: y ** x, '__rpow__') + + if not py3compat.PY3: + __div__ = _arith_method(operator.div, '__div__') + __rdiv__ = _arith_method(lambda x, y: y / x, '__rdiv__') + + def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, + copy=False, dtype=None): + """ + Represents wide format panel data, stored as 3-dimensional array + + Parameters + ---------- + data : ndarray (items x major x minor), or dict of DataFrames + items : Index or array-like + axis=1 + major_axis : Index or array-like + axis=1 + minor_axis : Index or array-like + axis=2 + dtype : dtype, default None + Data type to force, otherwise infer + copy : boolean, default False + Copy data from inputs. 
Only affects DataFrame / 2d ndarray input + """ + if data is None: + data = {} + + passed_axes = [items, major_axis, minor_axis] + axes = None + if isinstance(data, BlockManager): + if any(x is not None for x in passed_axes): + axes = [x if x is not None else y + for x, y in zip(passed_axes, data.axes)] + mgr = data + elif isinstance(data, dict): + mgr = self._init_dict(data, passed_axes, dtype=dtype) + copy = False + dtype = None + elif isinstance(data, (np.ndarray, list)): + mgr = self._init_matrix(data, passed_axes, dtype=dtype, copy=copy) + copy = False + dtype = None + else: # pragma: no cover + raise PandasError('Panel constructor not properly called!') + + NDFrame.__init__(self, mgr, axes=axes, copy=copy, dtype=dtype) + + @classmethod + def _from_axes(cls, data, axes): + # for construction from BlockManager + if isinstance(data, BlockManager): + return cls(data) + else: + items, major, minor = axes + return cls(data, items=items, major_axis=major, + minor_axis=minor, copy=False) + + def _init_dict(self, data, axes, dtype=None): + items, major, minor = axes + + # prefilter if items passed + if items is not None: + items = _ensure_index(items) + data = dict((k, v) for k, v in data.iteritems() if k in items) + else: + items = Index(_try_sort(data.keys())) + + for k, v in data.iteritems(): + if isinstance(v, dict): + data[k] = DataFrame(v) + + if major is None: + major = _extract_axis(data, axis=0) + + if minor is None: + minor = _extract_axis(data, axis=1) + + axes = [items, major, minor] + reshaped_data = data.copy() # shallow + + item_shape = len(major), len(minor) + for item in items: + v = values = data.get(item) + if v is None: + values = np.empty(item_shape, dtype=dtype) + values.fill(np.nan) + elif isinstance(v, DataFrame): + v = v.reindex(index=major, columns=minor, copy=False) + if dtype is not None: + v = v.astype(dtype) + values = v.values + reshaped_data[item] = values + + # segregates dtypes and forms blocks matching to columns + blocks = form_blocks(reshaped_data, axes) + mgr = BlockManager(blocks, axes).consolidate() + return mgr + + @property + def shape(self): + return len(self.items), len(self.major_axis), len(self.minor_axis) + + @classmethod + def from_dict(cls, data, intersect=False, orient='items', dtype=None): + """ + Construct Panel from dict of DataFrame objects + + Parameters + ---------- + data : dict + {field : DataFrame} + intersect : boolean + Intersect indexes of input DataFrames + orient : {'items', 'minor'}, default 'items' + The "orientation" of the data. If the keys of the passed dict + should be the items of the result panel, pass 'items' + (default). 
Otherwise if the columns of the values of the passed + DataFrame objects should be the items (which in the case of + mixed-dtype data you should do), instead pass 'minor' + + + Returns + ------- + Panel + """ + from collections import defaultdict + + orient = orient.lower() + if orient == 'minor': + new_data = defaultdict(dict) + for col, df in data.iteritems(): + for item, s in df.iteritems(): + new_data[item][col] = s + data = new_data + elif orient != 'items': # pragma: no cover + raise ValueError('only recognize items or minor for orientation') + + data, index, columns = _homogenize_dict(data, intersect=intersect, + dtype=dtype) + items = Index(sorted(data.keys())) + return cls(data, items, index, columns) + + def __getitem__(self, key): + if isinstance(self.items, MultiIndex): + return self._getitem_multilevel(key) + return super(Panel, self).__getitem__(key) + + def _getitem_multilevel(self, key): + loc = self.items.get_loc(key) + if isinstance(loc, (slice, np.ndarray)): + new_index = self.items[loc] + result_index = _maybe_droplevels(new_index, key) + new_values = self.values[loc, :, :] + result = Panel(new_values, + items=result_index, + major_axis=self.major_axis, + minor_axis=self.minor_axis) + return result + else: + return self._get_item_cache(key) + + def _init_matrix(self, data, axes, dtype=None, copy=False): + values = _prep_ndarray(data, copy=copy) + + if dtype is not None: + try: + values = values.astype(dtype) + except Exception: + raise ValueError('failed to cast to %s' % dtype) + + shape = values.shape + fixed_axes = [] + for i, ax in enumerate(axes): + if ax is None: + ax = _default_index(shape[i]) + else: + ax = _ensure_index(ax) + fixed_axes.append(ax) + + items = fixed_axes[0] + block = make_block(values, items, items) + return BlockManager([block], fixed_axes) + + + #---------------------------------------------------------------------- + # Array interface + + def __array__(self, dtype=None): + return self.values + + def __array_wrap__(self, result): + return self._constructor(result, items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, copy=False) + + #---------------------------------------------------------------------- + # Magic methods + + def __repr__(self): + class_name = str(self.__class__) + + I, N, K = len(self.items), len(self.major_axis), len(self.minor_axis) + + dims = 'Dimensions: %d (items) x %d (major) x %d (minor)' % (I, N, K) + + if len(self.major_axis) > 0: + major = 'Major axis: %s to %s' % (self.major_axis[0], + self.major_axis[-1]) + else: + major = 'Major axis: None' + + if len(self.minor_axis) > 0: + minor = 'Minor axis: %s to %s' % (self.minor_axis[0], + self.minor_axis[-1]) + else: + minor = 'Minor axis: None' + + if len(self.items) > 0: + items = 'Items: %s to %s' % (self.items[0], self.items[-1]) + else: + items = 'Items: None' + + output = '%s\n%s\n%s\n%s\n%s' % (class_name, dims, items, major, minor) + + return output + + def __iter__(self): + return iter(self.items) + + def iteritems(self): + for item in self.items: + yield item, self[item] + + # Name that won't get automatically converted to items by 2to3. items is + # already in use for the first axis. 
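+    # Illustrative note (hedged): iterkv behaves exactly like iteritems(),
+    # yielding (item_label, DataFrame) pairs, e.g.
+    #   for item, frame in panel.iterkv(): ...
+    # but its spelling is not rewritten by 2to3's iteritems -> items fixer.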
+ iterkv = iteritems + + def _get_plane_axes(self, axis): + """ + + """ + axis = self._get_axis_name(axis) + + if axis == 'major_axis': + index = self.minor_axis + columns = self.items + if axis == 'minor_axis': + index = self.major_axis + columns = self.items + elif axis == 'items': + index = self.major_axis + columns = self.minor_axis + + return index, columns + + @property + def _constructor(self): + return type(self) + + # Fancy indexing + _ix = None + + @property + def ix(self): + if self._ix is None: + self._ix = _NDFrameIndexer(self) + + return self._ix + + def _wrap_array(self, arr, axes, copy=False): + items, major, minor = axes + return self._constructor(arr, items=items, major_axis=major, + minor_axis=minor, copy=copy) + + fromDict = from_dict + + def to_sparse(self, fill_value=None, kind='block'): + """ + Convert to SparsePanel + + Parameters + ---------- + fill_value : float, default NaN + kind : {'block', 'integer'} + + Returns + ------- + y : SparseDataFrame + """ + from pandas.core.sparse import SparsePanel + frames = dict(self.iterkv()) + return SparsePanel(frames, items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, + default_kind=kind, + default_fill_value=fill_value) + + def to_excel(self, path, na_rep=''): + """ + Write each DataFrame in Panel to a separate excel sheet + + Parameters + ---------- + excel_writer : string or ExcelWriter object + File path or existing ExcelWriter + na_rep : string, default '' + Missing data rep'n + """ + from pandas.io.parsers import ExcelWriter + writer = ExcelWriter(path) + for item, df in self.iteritems(): + name = str(item) + df.to_excel(writer, name, na_rep=na_rep) + writer.save() + + # TODO: needed? + def keys(self): + return list(self.items) + + def _get_values(self): + self._consolidate_inplace() + return self._data.as_matrix() + + values = property(fget=_get_values) + + #---------------------------------------------------------------------- + # Getting and setting elements + + def get_value(self, item, major, minor): + """ + Quickly retrieve single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + + Returns + ------- + value : scalar value + """ + # hm, two layers to the onion + frame = self._get_item_cache(item) + return frame.get_value(major, minor) + + def set_value(self, item, major, minor, value): + """ + Quickly set single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + value : scalar + + Returns + ------- + panel : Panel + If label combo is contained, will be reference to calling Panel, + otherwise a new object + """ + try: + frame = self._get_item_cache(item) + frame.set_value(major, minor, value) + return self + except KeyError: + ax1, ax2, ax3 = self._expand_axes((item, major, minor)) + result = self.reindex(items=ax1, major=ax2, minor=ax3, copy=False) + + likely_dtype = com._infer_dtype(value) + made_bigger = not np.array_equal(ax1, self.items) + # how to make this logic simpler? 
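+            # Rough outline (comment only): the axes were expanded and the
+            # panel reindexed above; if the items axis actually grew, the new
+            # item's column may need an upcast (e.g. to float or object) so
+            # that `value` fits, which is what _possibly_cast_item checks below.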
+ if made_bigger: + com._possibly_cast_item(result, item, likely_dtype) + + return result.set_value(item, major, minor, value) + + def _box_item_values(self, key, values): + return DataFrame(values, index=self.major_axis, columns=self.minor_axis) + + def __getattr__(self, name): + """After regular attribute access, try looking up the name of an item. + This allows simpler access to items for interactive use.""" + if name in self.items: + return self[name] + raise AttributeError("'%s' object has no attribute '%s'" % + (type(self).__name__, name)) + + def _slice(self, slobj, axis=0): + new_data = self._data.get_slice(slobj, axis=axis) + return self._constructor(new_data) + + def __setitem__(self, key, value): + _, N, K = self.shape + if isinstance(value, DataFrame): + value = value.reindex(index=self.major_axis, + columns=self.minor_axis) + mat = value.values + elif isinstance(value, np.ndarray): + assert(value.shape == (N, K)) + mat = np.asarray(value) + elif np.isscalar(value): + dtype = _infer_dtype(value) + mat = np.empty((N, K), dtype=dtype) + mat.fill(value) + + mat = mat.reshape((1, N, K)) + NDFrame._set_item(self, key, mat) + + def pop(self, item): + """ + Return item slice from panel and delete from panel + + Parameters + ---------- + key : object + Must be contained in panel's items + + Returns + ------- + y : DataFrame + """ + return NDFrame.pop(self, item) + + def __getstate__(self): + "Returned pickled representation of the panel" + return self._data + + def __setstate__(self, state): + # old Panel pickle + if isinstance(state, BlockManager): + self._data = state + elif len(state) == 4: # pragma: no cover + self._unpickle_panel_compat(state) + else: # pragma: no cover + raise ValueError('unrecognized pickle') + self._item_cache = {} + + def _unpickle_panel_compat(self, state): # pragma: no cover + "Unpickle the panel" + _unpickle = com._unpickle_array + vals, items, major, minor = state + + items = _unpickle(items) + major = _unpickle(major) + minor = _unpickle(minor) + values = _unpickle(vals) + wp = Panel(values, items, major, minor) + self._data = wp._data + + def conform(self, frame, axis='items'): + """ + Conform input DataFrame to align with chosen axis pair. + + Parameters + ---------- + frame : DataFrame + axis : {'items', 'major', 'minor'} + + Axis the input corresponds to. 
E.g., if axis='major', then + the frame's columns would be items, and the index would be + values of the minor axis + + Returns + ------- + DataFrame + """ + index, columns = self._get_plane_axes(axis) + return frame.reindex(index=index, columns=columns) + + def reindex(self, major=None, items=None, minor=None, method=None, + major_axis=None, minor_axis=None, copy=True): + """ + Conform panel to new axis or axes + + Parameters + ---------- + major : Index or sequence, default None + Can also use 'major_axis' keyword + items : Index or sequence, default None + minor : Index or sequence, default None + Can also use 'minor_axis' keyword + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + + Returns + ------- + Panel (new object) + """ + result = self + + major = _mut_exclusive(major, major_axis) + minor = _mut_exclusive(minor, minor_axis) + + if major is not None: + result = result._reindex_axis(major, method, 1, copy) + + if minor is not None: + result = result._reindex_axis(minor, method, 2, copy) + + if items is not None: + result = result._reindex_axis(items, method, 0, copy) + + if result is self and copy: + raise ValueError('Must specify at least one axis') + + return result + + def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True): + """Conform Panel to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + index : array-like, optional + New labels / index to conform to. 
Preferably an Index object to + avoid duplicating data + axis : {0, 1} + 0 -> index (rows) + 1 -> columns + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed DataFrame + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Returns + ------- + reindexed : Panel + """ + self._consolidate_inplace() + return self._reindex_axis(labels, method, axis, copy) + + def reindex_like(self, other, method=None): + """ + Reindex Panel to match indices of another Panel + + Parameters + ---------- + other : Panel + method : string or None + + Returns + ------- + reindexed : Panel + """ + # todo: object columns + return self.reindex(major=other.major_axis, items=other.items, + minor=other.minor_axis, method=method) + + def _combine(self, other, func, axis=0): + if isinstance(other, Panel): + return self._combine_panel(other, func) + elif isinstance(other, DataFrame): + return self._combine_frame(other, func, axis=axis) + elif np.isscalar(other): + new_values = func(self.values, other) + return self._constructor(new_values, self.items, self.major_axis, + self.minor_axis) + + def __neg__(self): + return -1 * self + + def _combine_frame(self, other, func, axis=0): + index, columns = self._get_plane_axes(axis) + axis = self._get_axis_number(axis) + + other = other.reindex(index=index, columns=columns) + + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) + + return self._constructor(new_values, self.items, self.major_axis, + self.minor_axis) + + def _combine_panel(self, other, func): + items = self.items + other.items + major = self.major_axis + other.major_axis + minor = self.minor_axis + other.minor_axis + + # could check that everything's the same size, but forget it + this = self.reindex(items=items, major=major, minor=minor) + other = other.reindex(items=items, major=major, minor=minor) + + result_values = func(this.values, other.values) + + return self._constructor(result_values, items, major, minor) + + def fillna(self, value=None, method='pad'): + """ + Fill NaN values using the specified method. + + Member Series / TimeSeries are filled separately. + + Parameters + ---------- + value : any kind (should be same type as array) + Value to use to fill holes (e.g. 
0) + + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + + Returns + ------- + y : DataFrame + + See also + -------- + DataFrame.reindex, DataFrame.asfreq + """ + if value is None: + result = {} + for col, s in self.iterkv(): + result[col] = s.fillna(method=method, value=value) + + return self._constructor.from_dict(result) + else: + new_data = self._data.fillna(value) + return self._constructor(new_data) + + add = _panel_arith_method(operator.add, 'add') + subtract = sub = _panel_arith_method(operator.sub, 'subtract') + multiply = mul = _panel_arith_method(operator.mul, 'multiply') + + try: + divide = div = _panel_arith_method(operator.div, 'divide') + except AttributeError: # pragma: no cover + # Python 3 + divide = div = _panel_arith_method(operator.truediv, 'divide') + + def major_xs(self, key, copy=True): + """ + Return slice of panel along major axis + + Parameters + ---------- + key : object + Major axis label + copy : boolean, default False + Copy data + + Returns + ------- + y : DataFrame + index -> minor axis, columns -> items + """ + return self.xs(key, axis=1, copy=copy) + + def minor_xs(self, key, copy=True): + """ + Return slice of panel along minor axis + + Parameters + ---------- + key : object + Minor axis label + copy : boolean, default False + Copy data + + Returns + ------- + y : DataFrame + index -> major axis, columns -> items + """ + return self.xs(key, axis=2, copy=copy) + + def xs(self, key, axis=1, copy=True): + """ + Return slice of panel along selected axis + + Parameters + ---------- + key : object + Label + axis : {'items', 'major', 'minor}, default 1/'major' + + Returns + ------- + y : DataFrame + """ + if axis == 0: + data = self[key] + if copy: + data = data.copy() + return data + + self._consolidate_inplace() + axis_number = self._get_axis_number(axis) + new_data = self._data.xs(key, axis=axis_number, copy=copy) + return DataFrame(new_data) + + def groupby(self, function, axis='major'): + """ + Group data on given axis, returning GroupBy object + + Parameters + ---------- + function : callable + Mapping function for chosen access + axis : {'major', 'minor', 'items'}, default 'major' + + Returns + ------- + grouped : PanelGroupBy + """ + from pandas.core.groupby import PanelGroupBy + axis = self._get_axis_number(axis) + return PanelGroupBy(self, function, axis=axis) + + def swapaxes(self, axis1='major', axis2='minor', copy=True): + """ + Interchange axes and swap values axes appropriately + + Returns + ------- + y : Panel (new object) + """ + i = self._get_axis_number(axis1) + j = self._get_axis_number(axis2) + + if i == j: + raise ValueError('Cannot specify the same axis') + + mapping = {i : j, j : i} + + new_axes = (self._get_axis(mapping.get(k, k)) + for k in range(3)) + new_values = self.values.swapaxes(i, j) + if copy: + new_values = new_values.copy() + + return self._constructor(new_values, *new_axes) + + def transpose(self, items='items', major='major', minor='minor', + copy=False): + """ + Permute the dimensions of the Panel + + Parameters + ---------- + items : int or one of {'items', 'major', 'minor'} + major : int or one of {'items', 'major', 'minor'} + minor : int or one of {'items', 'major', 'minor'} + copy : boolean, default False + Make a copy of the underlying data. 
Mixed-dtype data will + always result in a copy + + Examples + -------- + >>> p.transpose(2, 0, 1) + >>> p.transpose(2, 0, 1, copy=True) + + Returns + ------- + y : Panel (new object) + """ + i, j, k = [self._get_axis_number(x) for x in [items, major, minor]] + + if i == j or i == k or j == k: + raise ValueError('Must specify 3 unique axes') + + new_axes = [self._get_axis(x) for x in [i, j, k]] + new_values = self.values.transpose((i, j, k)) + if copy: + new_values = new_values.copy() + return self._constructor(new_values, *new_axes) + + def to_frame(self, filter_observations=True): + """ + Transform wide format into long (stacked) format as DataFrame + + Parameters + ---------- + filter_observations : boolean, default True + Drop (major, minor) pairs without a complete set of observations + across all the items + + Returns + ------- + y : DataFrame + """ + _, N, K = self.shape + + if filter_observations: + mask = com.notnull(self.values).all(axis=0) + # size = mask.sum() + selector = mask.ravel() + else: + # size = N * K + selector = slice(None, None) + + data = {} + for item in self.items: + data[item] = self[item].values.ravel()[selector] + + major_labels = np.arange(N).repeat(K)[selector] + + # Anyone think of a better way to do this? np.repeat does not + # do what I want + minor_labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] + minor_labels = minor_labels.ravel()[selector] + + index = MultiIndex(levels=[self.major_axis, self.minor_axis], + labels=[major_labels, minor_labels], + names=['major', 'minor']) + + return DataFrame(data, index=index, columns=self.items) + + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) + + def filter(self, items): + """ + Restrict items in panel to input list + + Parameters + ---------- + items : sequence + + Returns + ------- + y : Panel + """ + intersection = self.items.intersection(items) + return self.reindex(items=intersection) + + def apply(self, func, axis='major'): + """ + Apply + + Parameters + ---------- + func : numpy function + Signature should match numpy.{sum, mean, var, std} etc. + axis : {'major', 'minor', 'items'} + fill_value : boolean, default True + Replace NaN values with specified first + + Returns + ------- + result : DataFrame or Panel + """ + i = self._get_axis_number(axis) + result = np.apply_along_axis(func, i, self.values) + return self._wrap_result(result, axis=axis) + + def _reduce(self, op, axis=0, skipna=True): + axis_name = self._get_axis_name(axis) + axis_number = self._get_axis_number(axis_name) + f = lambda x: op(x, axis=axis_number, skipna=skipna) + + result = f(self.values) + + index, columns = self._get_plane_axes(axis_name) + if axis_name != 'items': + result = result.T + + return DataFrame(result, index=index, columns=columns) + + def _wrap_result(self, result, axis): + axis = self._get_axis_name(axis) + index, columns = self._get_plane_axes(axis) + + if axis != 'items': + result = result.T + + return DataFrame(result, index=index, columns=columns) + + def count(self, axis='major'): + """ + Return number of observations over requested axis. 
+ + Parameters + ---------- + axis : {'items', 'major', 'minor'} or {0, 1, 2} + + Returns + ------- + count : DataFrame + """ + i = self._get_axis_number(axis) + + values = self.values + mask = np.isfinite(values) + result = mask.sum(axis=i) + + return self._wrap_result(result, axis) + + @Substitution(desc='sum', outname='sum') + @Appender(_agg_doc) + def sum(self, axis='major', skipna=True): + return self._reduce(nanops.nansum, axis=axis, skipna=skipna) + + @Substitution(desc='mean', outname='mean') + @Appender(_agg_doc) + def mean(self, axis='major', skipna=True): + return self._reduce(nanops.nanmean, axis=axis, skipna=skipna) + + @Substitution(desc='unbiased variance', outname='variance') + @Appender(_agg_doc) + def var(self, axis='major', skipna=True): + return self._reduce(nanops.nanvar, axis=axis, skipna=skipna) + + @Substitution(desc='unbiased standard deviation', outname='stdev') + @Appender(_agg_doc) + def std(self, axis='major', skipna=True): + return self.var(axis=axis, skipna=skipna).apply(np.sqrt) + + @Substitution(desc='unbiased skewness', outname='skew') + @Appender(_agg_doc) + def skew(self, axis='major', skipna=True): + return self._reduce(nanops.nanskew, axis=axis, skipna=skipna) + + @Substitution(desc='product', outname='prod') + @Appender(_agg_doc) + def prod(self, axis='major', skipna=True): + return self._reduce(nanops.nanprod, axis=axis, skipna=skipna) + + @Substitution(desc='compounded percentage', outname='compounded') + @Appender(_agg_doc) + def compound(self, axis='major', skipna=True): + return (1 + self).prod(axis=axis, skipna=skipna) - 1 + + @Substitution(desc='median', outname='median') + @Appender(_agg_doc) + def median(self, axis='major', skipna=True): + return self._reduce(nanops.nanmedian, axis=axis, skipna=skipna) + + @Substitution(desc='maximum', outname='maximum') + @Appender(_agg_doc) + def max(self, axis='major', skipna=True): + return self._reduce(nanops.nanmax, axis=axis, skipna=skipna) + + @Substitution(desc='minimum', outname='minimum') + @Appender(_agg_doc) + def min(self, axis='major', skipna=True): + return self._reduce(nanops.nanmin, axis=axis, skipna=skipna) + + def shift(self, lags, axis='major'): + """ + Shift major or minor axis by specified number of lags. 
Drops periods + + Parameters + ---------- + lags : int + Needs to be a positive number currently + axis : {'major', 'minor'} + + Returns + ------- + shifted : Panel + """ + values = self.values + items = self.items + major_axis = self.major_axis + minor_axis = self.minor_axis + + if axis == 'major': + values = values[:, :-lags, :] + major_axis = major_axis[lags:] + elif axis == 'minor': + values = values[:, :, :-lags] + minor_axis = minor_axis[lags:] + else: + raise ValueError('Invalid axis') + + return self._constructor(values, items=items, major_axis=major_axis, + minor_axis=minor_axis) + + def truncate(self, before=None, after=None, axis='major'): + """Function truncates a sorted Panel before and/or after some + particular values on the requested axis + + Parameters + ---------- + before : date + Left boundary + after : date + Right boundary + axis : {'major', 'minor', 'items'} + + Returns + ------- + Panel + """ + axis = self._get_axis_name(axis) + index = self._get_axis(axis) + + beg_slice, end_slice = index.slice_locs(before, after) + new_index = index[beg_slice:end_slice] + + return self.reindex(**{axis : new_index}) + + def join(self, other, how='left', lsuffix='', rsuffix=''): + """ + Join items with other Panel either on major and minor axes column + + Parameters + ---------- + other : Panel or list of Panels + Index should be similar to one of the columns in this one + how : {'left', 'right', 'outer', 'inner'} + How to handle indexes of the two objects. Default: 'left' + for joining on index, None otherwise + * left: use calling frame's index + * right: use input frame's index + * outer: form union of indexes + * inner: use intersection of indexes + lsuffix : string + Suffix to use from left frame's overlapping columns + rsuffix : string + Suffix to use from right frame's overlapping columns + + Returns + ------- + joined : Panel + """ + from pandas.tools.merge import concat + + if isinstance(other, Panel): + join_major, join_minor = self._get_join_index(other, how) + this = self.reindex(major=join_major, minor=join_minor) + other = other.reindex(major=join_major, minor=join_minor) + merged_data = this._data.merge(other._data, lsuffix, rsuffix) + return self._constructor(merged_data) + else: + if lsuffix or rsuffix: + raise ValueError('Suffixes not supported when passing multiple ' + 'panels') + + if how == 'left': + how = 'outer' + join_axes = [self.major_axis, self.minor_axis] + elif how == 'right': + raise ValueError('Right join not supported with multiple ' + 'panels') + else: + join_axes = None + + return concat([self] + list(other), axis=0, join=how, + join_axes=join_axes, verify_integrity=True) + + def _get_join_index(self, other, how): + if how == 'left': + join_major, join_minor = self.major_axis, self.minor_axis + elif how == 'right': + join_major, join_minor = other.major_axis, other.minor_axis + elif how == 'inner': + join_major = self.major_axis.intersection(other.major_axis) + join_minor = self.minor_axis.intersection(other.minor_axis) + elif how == 'outer': + join_major = self.major_axis.union(other.major_axis) + join_minor = self.minor_axis.union(other.minor_axis) + return join_major, join_minor + +WidePanel = Panel +LongPanel = DataFrame + +def _prep_ndarray(values, copy=True): + if not isinstance(values, np.ndarray): + values = np.asarray(values) + # NumPy strings are a pain, convert to object + if issubclass(values.dtype.type, basestring): + values = np.array(values, dtype=object, copy=True) + else: + if copy: + values = values.copy() + assert(values.ndim == 
3) + return values + +def _homogenize_dict(frames, intersect=True, dtype=None): + """ + Conform set of DataFrame-like objects to either an intersection + of indices / columns or a union. + + Parameters + ---------- + frames : dict + intersect : boolean, default True + + Returns + ------- + dict of aligned frames, index, columns + """ + result = {} + + adj_frames = {} + for k, v in frames.iteritems(): + if isinstance(v, dict): + adj_frames[k] = DataFrame(v) + else: + adj_frames[k] = v + + index = _extract_axis(adj_frames, axis=0, intersect=intersect) + columns = _extract_axis(adj_frames, axis=1, intersect=intersect) + + for key, frame in adj_frames.iteritems(): + result[key] = frame.reindex(index=index, columns=columns, + copy=False) + + return result, index, columns + + +def _extract_axis(data, axis=0, intersect=False): + if len(data) == 0: + index = Index([]) + elif len(data) > 0: + raw_lengths = [] + indexes = [] + + have_raw_arrays = False + have_frames = False + + for v in data.values(): + if isinstance(v, DataFrame): + have_frames = True + indexes.append(v._get_axis(axis)) + else: + have_raw_arrays = True + raw_lengths.append(v.shape[axis]) + + if have_frames: + index = _get_combined_index(indexes, intersect=intersect) + + if have_raw_arrays: + lengths = list(set(raw_lengths)) + if len(lengths) > 1: + raise ValueError('ndarrays must match shape on axis %d' % axis) + + if have_frames: + assert(lengths[0] == len(index)) + else: + index = Index(np.arange(lengths[0])) + + return _ensure_index(index) + + +def _monotonic(arr): + return not (arr[1:] < arr[:-1]).any() + +def install_ipython_completers(): # pragma: no cover + """Register the Panel type with IPython's tab completion machinery, so + that it knows about accessing column names as attributes.""" + from IPython.utils.generics import complete_object + + @complete_object.when_type(Panel) + def complete_dataframe(obj, prev_completions): + return prev_completions + [c for c in obj.items \ + if isinstance(c, basestring) and py3compat.isidentifier(c)] + +# Importing IPython brings in about 200 modules, so we want to avoid it unless +# we're in IPython (when those modules are loaded anyway). +if "IPython" in sys.modules: # pragma: no cover + try: + install_ipython_completers() + except Exception: + pass + diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py new file mode 100644 index 00000000..475a6822 --- /dev/null +++ b/pandas/core/reshape.py @@ -0,0 +1,754 @@ +# pylint: disable=E1101,E1103 +# pylint: disable=W0703,W0622,W0613,W0201 + +import itertools + +import numpy as np + +from pandas.core.series import Series +from pandas.core.frame import DataFrame + +from pandas.core.categorical import Categorical +from pandas.core.common import notnull, _ensure_platform_int +from pandas.core.groupby import (get_group_index, _compress_group_index, + decons_group_index) +import pandas.core.common as com +import pandas.lib as lib + + +from pandas.core.index import MultiIndex + + +class ReshapeError(Exception): + pass + + +class _Unstacker(object): + """ + Helper class to unstack data / pivot with multi-level index + + Parameters + ---------- + level : int or str, default last level + Level to "unstack". Accepts a name for the level. + + Examples + -------- + >>> s + one a 1. + one b 2. + two a 3. + two b 4. + + >>> s.unstack(level=-1) + a b + one 1. 2. + two 3. 4. + + >>> s.unstack(level=0) + one two + a 1. 2. + b 3. 4. 
+ + Returns + ------- + unstacked : DataFrame + """ + def __init__(self, values, index, level=-1, value_columns=None): + if values.ndim == 1: + values = values[:, np.newaxis] + self.values = values + self.value_columns = value_columns + + if value_columns is None and values.shape[1] != 1: # pragma: no cover + raise ValueError('must pass column labels for multi-column data') + + self.index = index + self.level = self.index._get_level_number(level) + + self.new_index_levels = list(index.levels) + self.new_index_names = list(index.names) + + self.removed_name = self.new_index_names.pop(self.level) + self.removed_level = self.new_index_levels.pop(self.level) + + v = self.level + lshape = self.index.levshape + self.full_shape = np.prod(lshape[:v] + lshape[v+1:]), lshape[v] + + self._make_sorted_values_labels() + self._make_selectors() + + def _make_sorted_values_labels(self): + v = self.level + + labs = self.index.labels + levs = self.index.levels + to_sort = labs[:v] + labs[v+1:] + [labs[v]] + sizes = [len(x) for x in levs[:v] + levs[v+1:] + [levs[v]]] + + group_index = get_group_index(to_sort, sizes) + max_groups = np.prod(sizes) + if max_groups > 1000000: + comp_index, obs_ids = _compress_group_index(group_index) + ngroups = len(obs_ids) + else: + comp_index, ngroups = group_index, max_groups + + indexer = lib.groupsort_indexer(comp_index, ngroups)[0] + indexer = _ensure_platform_int(indexer) + + self.sorted_values = self.values.take(indexer, axis=0) + self.sorted_labels = [l.take(indexer) for l in to_sort] + + def _make_selectors(self): + new_levels = self.new_index_levels + + # make the mask + group_index = get_group_index(self.sorted_labels[:-1], + [len(x) for x in new_levels]) + + group_index = _ensure_platform_int(group_index) + + group_mask = np.zeros(self.full_shape[0], dtype=bool) + group_mask.put(group_index, True) + + stride = self.index.levshape[self.level] + selector = self.sorted_labels[-1] + stride * group_index + mask = np.zeros(np.prod(self.full_shape), dtype=bool) + mask.put(selector, True) + + # compress labels + unique_groups = np.arange(self.full_shape[0])[group_mask] + compressor = group_index.searchsorted(unique_groups) + + if mask.sum() < len(self.index): + raise ReshapeError('Index contains duplicate entries, ' + 'cannot reshape') + + self.group_mask = group_mask + self.group_index = group_index + self.mask = mask + self.unique_groups = unique_groups + self.compressor = compressor + + def get_result(self): + # TODO: find a better way than this masking business + + values, value_mask = self.get_new_values() + columns = self.get_new_columns() + index = self.get_new_index() + + # filter out missing levels + if values.shape[1] > 0: + mask = value_mask.sum(0) > 0 + values = values[:, mask] + columns = columns[mask] + + return DataFrame(values, index=index, columns=columns) + + def get_new_values(self): + values = self.values + # place the values + length, width = self.full_shape + stride = values.shape[1] + result_width = width * stride + + new_values = np.empty((length, result_width), dtype=values.dtype) + new_mask = np.zeros((length, result_width), dtype=bool) + + if issubclass(values.dtype.type, np.integer): + new_values = new_values.astype(float) + + new_values.fill(np.nan) + + # is there a simpler / faster way of doing this? 
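+        # Hedged sketch of the loop below: the output has `width` columns (the
+        # size of the removed level) for each original value column, so for
+        # value column i the slice [:, i*width:(i+1)*width] is filled by
+        # scattering sorted_values[:, i] into the flat positions flagged in
+        # self.mask; new_mask marks which cells were actually observed so that
+        # all-missing columns can be dropped later in get_result.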
+ for i in xrange(values.shape[1]): + chunk = new_values[:, i * width : (i + 1) * width] + mask_chunk = new_mask[:, i * width : (i + 1) * width] + + chunk.flat[self.mask] = self.sorted_values[:, i] + mask_chunk.flat[self.mask] = True + + new_values = new_values.take(self.unique_groups, axis=0) + return new_values, new_mask + + def get_new_columns(self): + if self.value_columns is None: + return self.removed_level + + stride = len(self.removed_level) + width = len(self.value_columns) + propagator = np.repeat(np.arange(width), stride) + if isinstance(self.value_columns, MultiIndex): + new_levels = self.value_columns.levels + [self.removed_level] + new_names = self.value_columns.names + [self.removed_name] + + new_labels = [lab.take(propagator) + for lab in self.value_columns.labels] + new_labels.append(np.tile(np.arange(stride), width)) + else: + new_levels = [self.value_columns, self.removed_level] + new_names = [self.value_columns.name, self.removed_name] + + new_labels = [] + + new_labels.append(propagator) + new_labels.append(np.tile(np.arange(stride), width)) + + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + def get_new_index(self): + result_labels = [] + for cur in self.sorted_labels[:-1]: + result_labels.append(cur.take(self.compressor)) + + # construct the new index + if len(self.new_index_levels) == 1: + new_index = self.new_index_levels[0].take(self.unique_groups) + new_index.name = self.new_index_names[0] + else: + new_index = MultiIndex(levels=self.new_index_levels, + labels=result_labels, + names=self.new_index_names) + + return new_index + + +def _unstack_multiple(data, clocs): + if len(clocs) == 0: + return data + + # NOTE: This doesn't deal with hierarchical columns yet + + index = data.index + + clocs = [index._get_level_number(i) for i in clocs] + + rlocs = [i for i in range(index.nlevels) if i not in clocs] + + clevels = [index.levels[i] for i in clocs] + clabels = [index.labels[i] for i in clocs] + cnames = [index.names[i] for i in clocs] + rlevels = [index.levels[i] for i in rlocs] + rlabels = [index.labels[i] for i in rlocs] + rnames = [index.names[i] for i in rlocs] + + shape = [len(x) for x in clevels] + group_index = get_group_index(clabels, shape) + + comp_ids, obs_ids = _compress_group_index(group_index, sort=False) + recons_labels = decons_group_index(obs_ids, shape) + + dummy_index = MultiIndex(levels=rlevels + [obs_ids], + labels=rlabels + [comp_ids], + names=rnames + ['__placeholder__']) + + if isinstance(data, Series): + dummy = Series(data.values, index=dummy_index) + unstacked = dummy.unstack('__placeholder__') + new_levels = clevels + new_names = cnames + new_labels = recons_labels + else: + if isinstance(data.columns, MultiIndex): + raise NotImplementedError('Unstacking multiple levels with ' + 'hierarchical columns not yet supported') + + dummy = DataFrame(data.values, index=dummy_index, + columns=data.columns) + + unstacked = dummy.unstack('__placeholder__') + if isinstance(unstacked, Series): + unstcols = unstacked.index + else: + unstcols = unstacked.columns + new_levels = [unstcols.levels[0]] + clevels + new_names = [data.columns.name] + cnames + + new_labels = [unstcols.labels[0]] + for rec in recons_labels: + new_labels.append(rec.take(unstcols.labels[-1])) + + new_columns = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + if isinstance(unstacked, Series): + unstacked.index = new_columns + else: + unstacked.columns = new_columns + + return unstacked + +def pivot(self, index=None, columns=None, 
values=None): + """ + See DataFrame.pivot + """ + if values is None: + indexed = self.set_index([index, columns]) + return indexed.unstack(columns) + else: + indexed = Series(self[values], + index=[self[index], self[columns]]) + return indexed.unstack(columns) + +def pivot_simple(index, columns, values): + """ + Produce 'pivot' table based on 3 columns of this DataFrame. + Uses unique values from index / columns and fills with values. + + Parameters + ---------- + index : ndarray + Labels to use to make new frame's index + columns : ndarray + Labels to use to make new frame's columns + values : ndarray + Values to use for populating new frame's values + + Note + ---- + Obviously, all 3 of the input arguments must have the same length + + Returns + ------- + DataFrame + """ + assert(len(index) == len(columns) == len(values)) + + if len(index) == 0: + return DataFrame(index=[]) + + hindex = MultiIndex.from_arrays([index, columns]) + series = Series(values.ravel(), index=hindex) + series = series.sortlevel(0) + return series.unstack() + +def _slow_pivot(index, columns, values): + """ + Produce 'pivot' table based on 3 columns of this DataFrame. + Uses unique values from index / columns and fills with values. + + Parameters + ---------- + index : string or object + Column name to use to make new frame's index + columns : string or object + Column name to use to make new frame's columns + values : string or object + Column name to use for populating new frame's values + + Could benefit from some Cython here. + """ + tree = {} + for i, (idx, col) in enumerate(itertools.izip(index, columns)): + if col not in tree: + tree[col] = {} + branch = tree[col] + branch[idx] = values[i] + + return DataFrame(tree) + +def unstack(obj, level): + if isinstance(level, (tuple, list)): + return _unstack_multiple(obj, level) + + if isinstance(obj, DataFrame): + if isinstance(obj.index, MultiIndex): + return _unstack_frame(obj, level) + else: + return obj.T.stack(dropna=False) + else: + unstacker = _Unstacker(obj.values, obj.index, level=level) + return unstacker.get_result() + +def _unstack_frame(obj, level): + from pandas.core.internals import BlockManager, make_block + + if obj._is_mixed_type: + unstacker = _Unstacker(np.empty(obj.shape, dtype=bool), # dummy + obj.index, level=level, + value_columns=obj.columns) + new_columns = unstacker.get_new_columns() + new_index = unstacker.get_new_index() + new_axes = [new_columns, new_index] + + new_blocks = [] + mask_blocks = [] + for blk in obj._data.blocks: + bunstacker = _Unstacker(blk.values.T, obj.index, level=level, + value_columns=blk.items) + new_items = bunstacker.get_new_columns() + new_values, mask = bunstacker.get_new_values() + + mblk = make_block(mask.T, new_items, new_columns) + mask_blocks.append(mblk) + + newb = make_block(new_values.T, new_items, new_columns) + new_blocks.append(newb) + + result = DataFrame(BlockManager(new_blocks, new_axes)) + mask_frame = DataFrame(BlockManager(mask_blocks, new_axes)) + return result.ix[:, mask_frame.sum(0) > 0] + else: + unstacker = _Unstacker(obj.values, obj.index, level=level, + value_columns=obj.columns) + return unstacker.get_result() + +def stack(frame, level=-1, dropna=True): + """ + Convert DataFrame to Series with multi-level Index. 
Columns become the + second level of the resulting hierarchical index + + Returns + ------- + stacked : Series + """ + N, K = frame.shape + if isinstance(level, int) and level < 0: + level += frame.columns.nlevels + + level = frame.columns._get_level_number(level) + + if isinstance(frame.columns, MultiIndex): + return _stack_multi_columns(frame, level=level, dropna=True) + elif isinstance(frame.index, MultiIndex): + new_levels = list(frame.index.levels) + new_levels.append(frame.columns) + + new_labels = [lab.repeat(K) for lab in frame.index.labels] + new_labels.append(np.tile(np.arange(K), N).ravel()) + + new_names = list(frame.index.names) + new_names.append(frame.columns.name) + new_index = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + else: + ilabels = np.arange(N).repeat(K) + clabels = np.tile(np.arange(K), N).ravel() + new_index = MultiIndex(levels=[frame.index, frame.columns], + labels=[ilabels, clabels], + names=[frame.index.name, frame.columns.name]) + + new_values = frame.values.ravel() + if dropna: + mask = notnull(new_values) + new_values = new_values[mask] + new_index = new_index[mask] + return Series(new_values, index=new_index) + +def _stack_multi_columns(frame, level=-1, dropna=True): + this = frame.copy() + + # this makes life much simpler + if level != frame.columns.nlevels - 1: + # roll levels to put selected level at end + roll_columns = this.columns + for i in range(level, frame.columns.nlevels - 1): + roll_columns = roll_columns.swaplevel(i, i + 1) + this.columns = roll_columns + + if not this.columns.is_lexsorted(): + this = this.sortlevel(0, axis=1) + + # tuple list excluding level for grouping columns + if len(frame.columns.levels) > 2: + tuples = zip(*[lev.values.take(lab) + for lev, lab in zip(this.columns.levels[:-1], + this.columns.labels[:-1])]) + unique_groups = [key for key, _ in itertools.groupby(tuples)] + new_names = this.columns.names[:-1] + new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) + else: + new_columns = unique_groups = this.columns.levels[0] + + # time to ravel the values + new_data = {} + level_vals = this.columns.levels[-1] + levsize = len(level_vals) + for key in unique_groups: + loc = this.columns.get_loc(key) + + # can make more efficient? + if loc.stop - loc.start != levsize: + chunk = this.ix[:, this.columns[loc]] + chunk.columns = level_vals.take(chunk.columns.labels[-1]) + value_slice = chunk.reindex(columns=level_vals).values + else: + if frame._is_mixed_type: + value_slice = this.ix[:, this.columns[loc]].values + else: + value_slice = this.values[:, loc] + + new_data[key] = value_slice.ravel() + + N = len(this) + + if isinstance(this.index, MultiIndex): + new_levels = list(this.index.levels) + new_names = list(this.index.names) + new_labels = [lab.repeat(levsize) for lab in this.index.labels] + else: + new_levels = [this.index] + new_labels = [np.arange(N).repeat(levsize)] + new_names = [this.index.name] # something better? + + new_levels.append(frame.columns.levels[level]) + new_labels.append(np.tile(np.arange(levsize), N)) + new_names.append(frame.columns.names[level]) + + new_index = MultiIndex(levels=new_levels, labels=new_labels, + names=new_names) + + result = DataFrame(new_data, index=new_index, columns=new_columns) + + # more efficient way to go about this? can do the whole masking biz but + # will only save a small amount of time... 
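+ # When dropna is True, rows of the stacked result that are missing in every
+ # remaining column group are removed below.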
+ if dropna: + result = result.dropna(axis=0, how='all') + + return result + + +def melt(frame, id_vars=None, value_vars=None): + """ + "Unpivots" a DataFrame from wide format to long format, optionally leaving + id variables set + + Parameters + ---------- + frame : DataFrame + id_vars : + value_vars : + + Examples + -------- + >>> df + A B C + a 1 2 + b 3 4 + c 5 6 + + >>> melt(df, id_vars=['A']) + A variable value + a B 1 + b B 3 + c B 5 + a C 2 + b C 4 + c C 6 + """ + # TODO: what about the existing index? + + N, K = frame.shape + + mdata = {} + + if id_vars is not None: + id_vars = list(id_vars) + frame = frame.copy() + K -= len(id_vars) + for col in id_vars: + mdata[col] = np.tile(frame.pop(col).values, K) + else: + id_vars = [] + + mcolumns = id_vars + ['variable', 'value'] + + mdata['value'] = frame.values.ravel('F') + mdata['variable'] = np.asarray(frame.columns).repeat(N) + return DataFrame(mdata, columns=mcolumns) + + +def lreshape(data, groups, dropna=True, label=None): + """ + Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + + Parameters + ---------- + data : DataFrame + groups : dict + {new_name : list_of_columns} + dropna : boolean, default True + + Examples + -------- + >>> data + hr1 hr2 team year1 year2 + 0 514 545 Red Sox 2007 2008 + 1 573 526 Yankees 2007 2008 + + >>> pd.lreshape(data, {'year': ['year1', 'year2'], + 'hr': ['hr1', 'hr2']}) + team hr year + 0 Red Sox 514 2007 + 1 Yankees 573 2007 + 2 Red Sox 545 2008 + 3 Yankees 526 2008 + + Returns + ------- + reshaped : DataFrame + """ + if isinstance(groups, dict): + keys = groups.keys() + values = groups.values() + else: + keys, values = zip(*groups) + + all_cols = list(set.union(*[set(x) for x in values])) + id_cols = list(data.columns.diff(all_cols)) + + K = len(values[0]) + + for seq in values: + if len(seq) != K: + raise ValueError('All column lists must be same length') + + mdata = {} + pivot_cols = [] + + for target, names in zip(keys, values): + mdata[target] = com._concat_compat([data[col].values for col in names]) + pivot_cols.append(target) + + for col in id_cols: + mdata[col] = np.tile(data[col].values, K) + + if dropna: + mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool) + for c in pivot_cols: + mask &= notnull(mdata[c]) + if not mask.all(): + mdata = dict((k, v[mask]) for k, v in mdata.iteritems()) + + return DataFrame(mdata, columns=id_cols + pivot_cols) + + +def convert_dummies(data, cat_variables, prefix_sep='_'): + """ + Compute DataFrame with specified columns converted to dummy variables (0 / + 1). Result columns will be prefixed with the column name, then the level + name, e.g. 
'A_foo' for column A and level foo + + Parameters + ---------- + data : DataFrame + cat_variables : list-like + Must be column names in the DataFrame + prefix_sep : string, default '_' + String to use to separate column name from dummy level + + Returns + ------- + dummies : DataFrame + """ + result = data.drop(cat_variables, axis=1) + for variable in cat_variables: + dummies = get_dummies(data[variable], prefix=variable, + prefix_sep=prefix_sep) + result = result.join(dummies) + return result + + +def get_dummies(data, prefix=None, prefix_sep='_'): + """ + Convert categorical variable into dummy/indicator variables + + Parameters + ---------- + data : array-like or Series + prefix : string, default None + String to append DataFrame column names + prefix_sep : string, default '_' + If appending prefix, separator/delimiter to use + + Returns + ------- + dummies : DataFrame + """ + cat = Categorical.from_array(np.asarray(data)) + dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0) + + if prefix is not None: + dummy_cols = ['%s%s%s' % (prefix, prefix_sep, str(v)) + for v in cat.levels] + else: + dummy_cols = cat.levels + + if isinstance(data, Series): + index = data.index + else: + index = None + + return DataFrame(dummy_mat, index=index, columns=dummy_cols) + + +def make_axis_dummies(frame, axis='minor', transform=None): + """ + Construct 1-0 dummy variables corresponding to designated axis + labels + + Parameters + ---------- + axis : {'major', 'minor'}, default 'minor' + transform : function, default None + Function to apply to axis labels first. For example, to + get "day of week" dummies in a time series regression you might + call: + make_axis_dummies(panel, axis='major', + transform=lambda d: d.weekday()) + Returns + ------- + dummies : DataFrame + Column names taken from chosen axis + """ + numbers = { + 'major' : 0, + 'minor' : 1 + } + num = numbers.get(axis, axis) + + items = frame.index.levels[num] + labels = frame.index.labels[num] + if transform is not None: + mapped_items = items.map(transform) + cat = Categorical.from_array(mapped_items.take(labels)) + labels = cat.labels + items = cat.levels + + values = np.eye(len(items), dtype=float) + values = values.take(labels, axis=0) + + return DataFrame(values, columns=items, index=frame.index) + +def block2d_to_block3d(values, items, shape, major_labels, minor_labels, + ref_items=None): + """ + Developer method for pivoting DataFrame -> Panel. Used in HDFStore and + DataFrame.to_panel + """ + from pandas.core.internals import make_block + panel_shape = (len(items),) + shape + + # TODO: lexsort depth needs to be 2!! + + # Create observation selection vector using major and minor + # labels, for converting to panel format. 
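+ # selector gives each observation's C-order flat position in the
+ # (n_major, n_minor) grid: the major label picks the row, the minor label
+ # the column.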
+ selector = minor_labels + shape[1] * major_labels + mask = np.zeros(np.prod(shape), dtype=bool) + mask.put(selector, True) + + pvalues = np.empty(panel_shape, dtype=values.dtype) + if not issubclass(pvalues.dtype.type, np.integer): + pvalues.fill(np.nan) + + values = values + for i in xrange(len(items)): + pvalues[i].flat[mask] = values[:, i] + + if ref_items is None: + ref_items = items + + return make_block(pvalues, items, ref_items) diff --git a/pandas/core/series.py b/pandas/core/series.py new file mode 100644 index 00000000..1243a850 --- /dev/null +++ b/pandas/core/series.py @@ -0,0 +1,2866 @@ +""" +Data structure for 1-dimensional cross-sectional and time series data +""" + +# pylint: disable=E1101,E1103 +# pylint: disable=W0703,W0622,W0613,W0201 + +from itertools import izip +import operator +from distutils.version import LooseVersion + +from numpy import nan, ndarray +import numpy as np +import numpy.ma as ma + +from pandas.core.common import (isnull, notnull, _is_bool_indexer, + _default_index, _maybe_upcast, + _asarray_tuplesafe) +from pandas.core.index import (Index, MultiIndex, InvalidIndexError, + _ensure_index, _handle_legacy_indexes) +from pandas.core.indexing import _SeriesIndexer +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex +from pandas.util import py3compat +from pandas.util.terminal import get_terminal_size +import pandas.core.common as com +import pandas.core.datetools as datetools +import pandas.core.format as fmt +import pandas.core.generic as generic +import pandas.core.nanops as nanops +import pandas.lib as lib +from pandas.util.decorators import Appender, Substitution + +from pandas.compat.scipy import scoreatpercentile as _quantile + +__all__ = ['Series', 'TimeSeries'] + +_np_version = np.version.short_version +_np_version_under1p6 = LooseVersion(_np_version) < '1.6' + +_SHOW_WARNINGS = True + +#---------------------------------------------------------------------- +# Wrapper function for Series arithmetic methods + +def _arith_method(op, name): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + result = np.empty(len(x), dtype=x.dtype) + if isinstance(y, np.ndarray): + mask = notnull(x) & notnull(y) + result[mask] = op(x[mask], y[mask]) + else: + mask = notnull(x) + result[mask] = op(x[mask], y) + np.putmask(result, -mask, np.nan) + + return result + + def wrapper(self, other): + from pandas.core.frame import DataFrame + + if isinstance(other, Series): + if self.index.equals(other.index): + name = _maybe_match_name(self, other) + return Series(na_op(self.values, other.values), + index=self.index, name=name) + + join_idx, lidx, ridx = self.index.join(other.index, how='outer', + return_indexers=True) + + lvalues = self.values + rvalues = other.values + + if lidx is not None: + lvalues = com.take_1d(lvalues, lidx) + + if ridx is not None: + rvalues = com.take_1d(rvalues, ridx) + + arr = na_op(lvalues, rvalues) + + name = _maybe_match_name(self, other) + return Series(arr, index=join_idx, name=name) + elif isinstance(other, DataFrame): + return NotImplemented + else: + # scalars + return Series(na_op(self.values, other), + index=self.index, name=self.name) + return wrapper + + +def _comp_method(op, name): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
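+ Used for the comparison operators (__eq__, __ne__, __lt__, __le__, __gt__,
+ __ge__); object-dtype comparisons are routed through lib.vec_compare and
+ lib.scalar_compare.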
+ """ + def na_op(x, y): + if x.dtype == np.object_: + if isinstance(y, list): + y = lib.list_to_object_array(y) + + if isinstance(y, np.ndarray): + result = lib.vec_compare(x, y, op) + else: + result = lib.scalar_compare(x, y, op) + else: + result = op(x, y) + + return result + + def wrapper(self, other): + from pandas.core.frame import DataFrame + + if isinstance(other, Series): + name = _maybe_match_name(self, other) + return Series(na_op(self.values, other.values), + index=self.index, name=name) + elif isinstance(other, DataFrame): # pragma: no cover + return NotImplemented + elif isinstance(other, np.ndarray): + return Series(na_op(self.values, np.asarray(other)), + index=self.index, name=self.name) + else: + values = self.values + other = lib.convert_scalar(values, other) + + if issubclass(values.dtype.type, np.datetime64): + values = values.view('i8') + + # scalars + res = na_op(values, other) + if np.isscalar(res): + raise TypeError('Could not compare %s type with Series' + % type(other)) + return Series(na_op(values, other), + index=self.index, name=self.name) + return wrapper + + +def _bool_method(op, name): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + def na_op(x, y): + try: + result = op(x, y) + except TypeError: + if isinstance(y, list): + y = lib.list_to_object_array(y) + + if isinstance(y, np.ndarray): + if (x.dtype == np.bool_ and + y.dtype == np.bool_): # pragma: no cover + result = op(x, y) # when would this be hit? + else: + x = com._ensure_object(x) + y = com._ensure_object(y) + result = lib.vec_binop(x, y, op) + else: + result = lib.scalar_binop(x, y, op) + + return result + + def wrapper(self, other): + from pandas.core.frame import DataFrame + + if isinstance(other, Series): + name = _maybe_match_name(self, other) + return Series(na_op(self.values, other.values), + index=self.index, name=name) + elif isinstance(other, DataFrame): + return NotImplemented + else: + # scalars + return Series(na_op(self.values, other), + index=self.index, name=self.name) + return wrapper + + + +def _radd_compat(left, right): + radd = lambda x, y: y + x + # GH #353, NumPy 1.5.1 workaround + try: + output = radd(left, right) + except TypeError: + cond = (_np_version_under1p6 and + left.dtype == np.object_) + if cond: # pragma: no cover + output = np.empty_like(left) + output.flat[:] = [radd(x, right) for x in left.flat] + else: + raise + + return output + + +def _maybe_match_name(a, b): + name = None + if a.name == b.name: + name = a.name + return name + + +def _flex_method(op, name): + doc = """ + Binary operator %s with support to substitute a fill_value for missing data + in one of the inputs + + Parameters + ---------- + other: Series or scalar value + fill_value : None or float value, default None (NaN) + Fill missing (NaN) values with this value. 
If both Series are + missing, the result will be missing + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Returns + ------- + result : Series + """ % name + + @Appender(doc) + def f(self, other, level=None, fill_value=None): + return self._binop(other, op, level=level, fill_value=fill_value) + + f.__name__ = name + return f + +def _unbox(func): + @Appender(func.__doc__) + def f(self, *args, **kwargs): + result = func(self, *args, **kwargs) + if isinstance(result, np.ndarray) and result.ndim == 0: + return result.item() + else: # pragma: no cover + return result + f.__name__ = func.__name__ + return f + +_stat_doc = """ +Return %(name)s of values +%(na_action)s + +Parameters +---------- +skipna : boolean, default True + Exclude NA/null values +level : int, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a smaller Series +%(extras)s +Returns +------- +%(shortname)s : float (or Series if level specified) +""" +_doc_exclude_na = "NA/null values are excluded" +_doc_ndarray_interface = ("Extra parameters are to preserve ndarray" + "interface.\n") + + +def _make_stat_func(nanop, name, shortname, na_action=_doc_exclude_na, + extras=_doc_ndarray_interface): + + @Substitution(name=name, shortname=shortname, + na_action=na_action, extras=extras) + @Appender(_stat_doc) + def f(self, axis=0, dtype=None, out=None, skipna=True, level=None): + if level is not None: + return self._agg_by_level(shortname, level=level, skipna=skipna) + return nanop(self.values, skipna=skipna) + f.__name__ = shortname + return f + +#---------------------------------------------------------------------- +# Series class + +class Series(np.ndarray, generic.PandasObject): + _AXIS_NUMBERS = { + 'index' : 0 + } + + _AXIS_NAMES = dict((v, k) for k, v in _AXIS_NUMBERS.iteritems()) + + def __new__(cls, data=None, index=None, dtype=None, name=None, + copy=False): + if data is None: + data = {} + + if index is not None: + index = _ensure_index(index) + + if isinstance(data, Series): + if index is None: + index = data.index + elif isinstance(data, dict): + if index is None: + index = Index(sorted(data)) + try: + if isinstance(index, DatetimeIndex): + # coerce back to datetime objects for lookup + data = lib.fast_multiget(data, index.astype('O'), + default=np.nan) + elif isinstance(index, PeriodIndex): + data = [data.get(i, nan) for i in index] + else: + data = lib.fast_multiget(data, index.values, default=np.nan) + except TypeError: + data = [data.get(i, nan) for i in index] + + if dtype is not None: + dtype = np.dtype(dtype) + + subarr = _sanitize_array(data, index, dtype, copy, + raise_cast_failure=True) + + if not isinstance(subarr, np.ndarray): + return subarr + + if index is None: + index = _default_index(len(subarr)) + + # Change the class of the array to be the subclass type. + if index.is_all_dates: + if not isinstance(index, (DatetimeIndex, PeriodIndex)): + index = DatetimeIndex(index) + subarr = subarr.view(TimeSeries) + else: + subarr = subarr.view(Series) + subarr.index = index + subarr.name = name + + return subarr + + def __init__(self, data=None, index=None, dtype=None, name=None, + copy=False): + """One-dimensional ndarray with axis labels (including time +series). Labels need not be unique but must be any hashable type. The object +supports both integer- and label-based indexing and provides a host of methods +for performing operations involving the index. 
Statistical methods from ndarray +have been overridden to automatically exclude missing data (currently +represented as NaN) + +Operations between Series (+, -, /, *, **) align values based on their +associated index values-- they need not be the same length. The result +index will be the sorted union of the two indexes. + +Parameters +---------- +data : array-like, dict, or scalar value + Contains data stored in Series +index : array-like or Index (1d) + + Values must be unique and hashable, same length as data. Index object + (or other iterable of same length as data) Will default to + np.arange(len(data)) if not provided. If both a dict and index sequence + are used, the index will override the keys found in the dict. + +dtype : numpy.dtype or None + If None, dtype will be inferred copy : boolean, default False Copy + input data +copy : boolean, default False + """ + pass + + @property + def _constructor(self): + return Series + + def __hash__(self): + raise TypeError('unhashable type') + + _index = None + index = lib.SeriesIndex() + + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. + """ + self._index = getattr(obj, '_index', None) + self.name = getattr(obj, 'name', None) + + def __contains__(self, key): + return key in self.index + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(ndarray.__reduce__(self)) + subclass_state = (self.index, self.name) + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + ndarray.__setstate__(self, nd_state) + + # backwards compat + index, name = own_state[0], None + if len(own_state) > 1: + name = own_state[1] + + self.index = _handle_legacy_indexes([index])[0] + self.name = name + + _ix = None + + @property + def ix(self): + if self._ix is None: + self._ix = _SeriesIndexer(self) + + return self._ix + + def __getitem__(self, key): + try: + return self.index.get_value(self, key) + except InvalidIndexError: + pass + except KeyError: + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # kludge + pass + else: + raise + except Exception: + raise + + if com.is_iterator(key): + key = list(key) + + # boolean + # special handling of boolean data with NAs stored in object + # arrays. 
Since we can't represent NA with dtype=bool + if _is_bool_indexer(key): + key = self._check_bool_indexer(key) + key = np.asarray(key, dtype=bool) + + return self._get_with(key) + + def _get_with(self, key): + # other: fancy integer or otherwise + if isinstance(key, slice): + from pandas.core.indexing import _is_index_slice + + idx_type = self.index.inferred_type + if idx_type == 'floating': + indexer = self.ix._convert_to_indexer(key, axis=0) + elif idx_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + return self._get_values(indexer) + else: + if isinstance(key, tuple): + try: + return self._get_values_tuple(key) + except: + if len(key) == 1: + key = key[0] + if isinstance(key, slice): + return self._get_values(key) + raise + + if not isinstance(key, (list, np.ndarray)): # pragma: no cover + key = list(key) + + if isinstance(key, Index): + key_type = lib.infer_dtype(key.values) + else: + key_type = lib.infer_dtype(key) + + if key_type == 'integer': + if self.index.inferred_type == 'integer': + return self.reindex(key) + else: + return self._get_values(key) + elif key_type == 'boolean': + return self._get_values(key) + else: + try: + return self.reindex(key) + except Exception: + # [slice(0, 5, None)] will break if you convert to ndarray, + # e.g. as requested by np.median + # hack + if isinstance(key[0], slice): + return self._get_values(key) + raise + + def _get_values_tuple(self, key): + # mpl hackaround + if any(k is None for k in key): + return self._get_values(key) + + if not isinstance(self.index, MultiIndex): + raise ValueError('Can only tuple-index with a MultiIndex') + + # If key is contained, would have returned by now + indexer, new_index = self.index.get_loc_level(key) + return Series(self.values[indexer], index=new_index, name=self.name) + + def _get_values(self, indexer): + try: + return Series(self.values[indexer], index=self.index[indexer], + name=self.name) + except Exception: + return self.values[indexer] + + def __setitem__(self, key, value): + try: + try: + self.index._engine.set_value(self, key, value) + return + except KeyError: + values = self.values + values[self.index.get_loc(key)] = value + return + except KeyError: + if (com.is_integer(key) + and not self.index.inferred_type == 'integer'): + + values[key] = value + return + + raise KeyError('%s not in this series!' 
% str(key)) + except TypeError, e: + # python 3 type errors should be raised + if 'unorderable' in str(e): # pragma: no cover + raise IndexError(key) + # Could not hash item + + if _is_bool_indexer(key): + key = self._check_bool_indexer(key) + key = np.asarray(key, dtype=bool) + + self._set_with(key, value) + + def _set_with(self, key, value): + # other: fancy integer or otherwise + if isinstance(key, slice): + from pandas.core.indexing import _is_index_slice + if self.index.inferred_type == 'integer' or _is_index_slice(key): + indexer = key + else: + indexer = self.ix._convert_to_indexer(key, axis=0) + return self._set_values(indexer, value) + else: + if isinstance(key, tuple): + try: + self._set_values(key, value) + except Exception: + pass + + if not isinstance(key, (list, np.ndarray)): + key = list(key) + + if isinstance(key, Index): + key_type = lib.infer_dtype(key.values) + else: + key_type = lib.infer_dtype(key) + + if key_type == 'integer': + if self.index.inferred_type == 'integer': + self._set_labels(key, value) + else: + return self._set_values(key, value) + elif key_type == 'boolean': + self._set_values(key, value) + else: + self._set_labels(key, value) + + def _set_labels(self, key, value): + if isinstance(key, Index): + key = key.values + else: + key = _asarray_tuplesafe(key) + indexer = self.index.get_indexer(key) + mask = indexer == -1 + if mask.any(): + raise ValueError('%s not contained in the index' + % str(key[mask])) + self._set_values(indexer, value) + + def _set_values(self, key, value): + values = self.values + values[key] = lib.convert_scalar(values, value) + + # help out SparseSeries + _get_val_at = ndarray.__getitem__ + + def __getslice__(self, i, j): + if i < 0: + i = 0 + if j < 0: + j = 0 + slobj = slice(i, j) + return self.__getitem__(slobj) + + def _check_bool_indexer(self, key): + # boolean indexing, need to check that the data are aligned, otherwise + # disallowed + result = key + if isinstance(key, Series) and key.dtype == np.bool_: + if not key.index.equals(self.index): + result = key.reindex(self.index) + + if isinstance(result, np.ndarray) and result.dtype == np.object_: + mask = isnull(result) + if mask.any(): + raise ValueError('cannot index with vector containing ' + 'NA / NaN values') + + return result + + def __setslice__(self, i, j, value): + """Set slice equal to given value(s)""" + if i < 0: + i = 0 + if j < 0: + j = 0 + slobj = slice(i, j) + return self.__setitem__(slobj, value) + + def astype(self, dtype): + """ + See numpy.ndarray.astype + """ + casted = com._astype_nansafe(self.values, dtype) + return self._constructor(casted, index=self.index, name=self.name) + + def repeat(self, reps): + """ + See ndarray.repeat + """ + new_index = self.index.repeat(reps) + new_values = self.values.repeat(reps) + return Series(new_values, index=new_index, name=self.name) + + def reshape(self, newshape, order='C'): + """ + See numpy.ndarray.reshape + """ + if isinstance(newshape, tuple) and len(newshape) > 1: + return self.values.reshape(newshape, order=order) + else: + return ndarray.reshape(self, newshape, order) + + def get(self, label, default=None): + """ + Returns value occupying requested label, default to specified + missing value if not present. 
Analogous to dict.get + + Parameters + ---------- + label : object + Label value looking for + default : object, optional + Value to return if label not in index + + Returns + ------- + y : scalar + """ + try: + return self.get_value(label) + except KeyError: + return default + + def iget_value(self, i): + """ + Return the i-th value or values in the Series by location + + Parameters + ---------- + i : int, slice, or sequence of integers + + Returns + ------- + value : scalar (int) or Series (slice, sequence) + """ + if isinstance(i, slice): + return self[i] + else: + label = self.index[i] + if isinstance(label, Index): + return self.reindex(label) + else: + return self[label] + + iget = iget_value + irow = iget_value + + def get_value(self, label): + """ + Quickly retrieve single value at passed index label + + Parameters + ---------- + index : label + + Returns + ------- + value : scalar value + """ + return self.index._engine.get_value(self, label) + + def set_value(self, label, value): + """ + Quickly set single value at passed label. If label is not contained, a + new object is created with the label placed at the end of the result + index + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed + value : object + Scalar value + + Returns + ------- + series : Series + If label is contained, will be reference to calling Series, + otherwise a new object + """ + try: + self.index._engine.set_value(self, label, value) + return self + except KeyError: + new_index = np.concatenate([self.index.values, [label]]) + new_values = np.concatenate([self.values, [value]]) + return Series(new_values, index=new_index, name=self.name) + + def reset_index(self, drop=False, name=None): + """ + Analagous to the DataFrame.reset_index function, see docstring there. 
+ + Parameters + ---------- + drop : boolean, default False + Do not try to insert index into dataframe columns + name : object, default None + The name of the column corresponding to the Series values + + Returns + ---------- + resetted : DataFrame, or Series if drop == True + """ + if drop: + return Series(self, index=np.arange(len(self)), name=self.name) + else: + from pandas.core.frame import DataFrame + if name is None: + df = DataFrame(self) + else: + df = DataFrame({name : self}) + + return df.reset_index(drop=drop) + + def __repr__(self): + """Clean string representation of a Series""" + width, height = get_terminal_size() + max_rows = (height if fmt.print_config.max_rows == 0 + else fmt.print_config.max_rows) + if len(self.index) > (max_rows or 1000): + result = self._tidy_repr(min(30, max_rows - 4)) + elif len(self.index) > 0: + result = self._get_repr(print_header=True, + length=len(self) > 50, + name=True) + else: + result = '%s' % ndarray.__repr__(self) + + return com.console_encode(result) + + def _tidy_repr(self, max_vals=20): + num = max_vals // 2 + head = self[:num]._get_repr(print_header=True, length=False, + name=False) + tail = self[-(max_vals - num):]._get_repr(print_header=False, + length=False, + name=False) + result = head + '\n...\n' + tail + return '%s\n%s' % (result, self._repr_footer()) + + def _repr_footer(self): + namestr = "Name: %s, " % str(self.name) if self.name else "" + return '%sLength: %d' % (namestr, len(self)) + + def to_string(self, buf=None, na_rep='NaN', float_format=None, + nanRep=None, length=False, name=False): + """ + Render a string representation of the Series + + Parameters + ---------- + buf : StringIO-like, optional + buffer to write to + na_rep : string, optional + string representation of NAN to use, default 'NaN' + float_format : one-parameter function, optional + formatter function to apply to columns' elements if they are floats + default None + length : boolean, default False + Add the Series length + name : boolean, default False + Add the Series name (which may be None) + + Returns + ------- + formatted : string (if not buffer passed) + """ + + if nanRep is not None: # pragma: no cover + import warnings + warnings.warn("nanRep is deprecated, use na_rep", FutureWarning) + na_rep = nanRep + + the_repr = self._get_repr(float_format=float_format, na_rep=na_rep, + length=length, name=name) + if buf is None: + return the_repr + else: + print >> buf, the_repr + + def _get_repr(self, name=False, print_header=False, length=True, + na_rep='NaN', float_format=None): + formatter = fmt.SeriesFormatter(self, name=name, header=print_header, + length=length, na_rep=na_rep, + float_format=float_format) + return formatter.to_string() + + def __str__(self): + return repr(self) + + def __iter__(self): + if np.issubdtype(self.dtype, np.datetime64): + return (lib.Timestamp(x) for x in self.values) + else: + return iter(self.values) + + def iteritems(self, index=True): + """ + Lazily iterate over (index, value) tuples + """ + return izip(iter(self.index), iter(self)) + + iterkv = iteritems + if py3compat.PY3: # pragma: no cover + items = iteritems + + #---------------------------------------------------------------------- + # Arithmetic operators + + __add__ = _arith_method(operator.add, '__add__') + __sub__ = _arith_method(operator.sub, '__sub__') + __mul__ = _arith_method(operator.mul, '__mul__') + __truediv__ = _arith_method(operator.truediv, '__truediv__') + __floordiv__ = _arith_method(operator.floordiv, '__floordiv__') + __pow__ = 
_arith_method(operator.pow, '__pow__') + + __radd__ = _arith_method(_radd_compat, '__add__') + __rmul__ = _arith_method(operator.mul, '__mul__') + __rsub__ = _arith_method(lambda x, y: y - x, '__sub__') + __rtruediv__ = _arith_method(lambda x, y: y / x, '__truediv__') + __rfloordiv__ = _arith_method(lambda x, y: y // x, '__floordiv__') + __rpow__ = _arith_method(lambda x, y: y ** x, '__pow__') + + # comparisons + __gt__ = _comp_method(operator.gt, '__gt__') + __ge__ = _comp_method(operator.ge, '__ge__') + __lt__ = _comp_method(operator.lt, '__lt__') + __le__ = _comp_method(operator.le, '__le__') + __eq__ = _comp_method(operator.eq, '__eq__') + __ne__ = _comp_method(operator.ne, '__ne__') + + # binary logic + __or__ = _bool_method(operator.or_, '__or__') + __and__ = _bool_method(operator.and_, '__and__') + __xor__ = _bool_method(operator.xor, '__xor__') + + # Inplace operators + __iadd__ = __add__ + __isub__ = __sub__ + __imul__ = __mul__ + __itruediv__ = __truediv__ + __ifloordiv__ = __floordiv__ + __ipow__ = __pow__ + + # Python 2 division operators + if not py3compat.PY3: + __div__ = _arith_method(operator.div, '__div__') + __rdiv__ = _arith_method(lambda x, y: y / x, '__div__') + __idiv__ = __div__ + + #---------------------------------------------------------------------- + # unbox reductions + + all = _unbox(np.ndarray.all) + any = _unbox(np.ndarray.any) + + #---------------------------------------------------------------------- + # Misc public methods + + def keys(self): + "Alias for index" + return self.index + + # alas, I wish this worked + # values = lib.ValuesProperty() + + @property + def values(self): + """ + Return Series as ndarray + + Returns + ------- + arr : numpy.ndarray + """ + return self.view(ndarray) + + def copy(self, order='C'): + """ + Return new Series with copy of underlying values + + Returns + ------- + cp : Series + """ + return Series(self.values.copy(order), index=self.index, + name=self.name) + + def to_dict(self): + """ + Convert Series to {label -> value} dict + + Returns + ------- + value_dict : dict + """ + return dict(self.iteritems()) + + def to_sparse(self, kind='block', fill_value=None): + """ + Convert Series to SparseSeries + + Parameters + ---------- + kind : {'block', 'integer'} + fill_value : float, defaults to NaN (missing) + + Returns + ------- + sp : SparseSeries + """ + from pandas.core.sparse import SparseSeries + return SparseSeries(self, kind=kind, fill_value=fill_value, + name=self.name) + + def head(self, n=5): + """Returns first n rows of Series + """ + return self[:n] + + def tail(self, n=5): + """Returns last n rows of Series + """ + return self[-n:] + + #---------------------------------------------------------------------- + # Statistics, overridden ndarray methods + + # TODO: integrate bottleneck + + def count(self, level=None): + """ + Return number of non-NA/null observations in the Series + + Parameters + ---------- + level : int, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a smaller Series + + Returns + ------- + nobs : int or Series (if level specified) + """ + if level is not None: + mask = notnull(self.values) + + if isinstance(level, basestring): + level = self.index._get_level_number(level) + + level_index = self.index.levels[level] + + if len(self) == 0: + return Series(0, index=level_index) + + # call cython function + max_bin = len(level_index) + labels = com._ensure_int64(self.index.labels[level]) + counts = lib.count_level_1d(mask.view(np.uint8), + 
labels, max_bin) + return Series(counts, index=level_index) + + return notnull(self.values).sum() + + def value_counts(self): + """ + Returns Series containing counts of unique values. The resulting Series + will be in descending order so that the first element is the most + frequently-occurring element. Excludes NA values + + Returns + ------- + counts : Series + """ + from pandas.core.algorithms import value_counts + return value_counts(self.values, sort=True, ascending=False) + + def unique(self): + """ + Return array of unique values in the Series. Significantly faster than + numpy.unique + + Returns + ------- + uniques : ndarray + """ + return nanops.unique1d(self.values) + + def nunique(self): + """ + Return count of unique elements in the Series + + Returns + ------- + nunique : int + """ + return len(self.value_counts()) + + sum = _make_stat_func(nanops.nansum, 'sum', 'sum') + mean = _make_stat_func(nanops.nanmean, 'mean', 'mean') + median = _make_stat_func(nanops.nanmedian, 'median', 'median', extras='') + prod = _make_stat_func(nanops.nanprod, 'product', 'prod', extras='') + + @Substitution(name='mean absolute deviation', shortname='mad', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def mad(self, skipna=True, level=None): + if level is not None: + return self._agg_by_level('mad', level=level, skipna=skipna) + + demeaned = self - self.mean(skipna=skipna) + return np.abs(demeaned).mean(skipna=skipna) + + @Substitution(name='minimum', shortname='min', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def min(self, axis=None, out=None, skipna=True, level=None): + if level is not None: + return self._agg_by_level('min', level=level, skipna=skipna) + return nanops.nanmin(self.values, skipna=skipna) + + @Substitution(name='maximum', shortname='max', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def max(self, axis=None, out=None, skipna=True, level=None): + if level is not None: + return self._agg_by_level('max', level=level, skipna=skipna) + return nanops.nanmax(self.values, skipna=skipna) + + @Substitution(name='standard deviation', shortname='stdev', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def std(self, axis=None, dtype=None, out=None, ddof=1, skipna=True, + level=None): + if level is not None: + return self._agg_by_level('std', level=level, skipna=skipna, + ddof=ddof) + return np.sqrt(nanops.nanvar(self.values, skipna=skipna, ddof=ddof)) + + @Substitution(name='variance', shortname='var', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def var(self, axis=None, dtype=None, out=None, ddof=1, skipna=True, + level=None): + if level is not None: + return self._agg_by_level('var', level=level, skipna=skipna, + ddof=ddof) + return nanops.nanvar(self.values, skipna=skipna, ddof=ddof) + + @Substitution(name='unbiased skewness', shortname='skew', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def skew(self, skipna=True, level=None): + if level is not None: + return self._agg_by_level('skew', level=level, skipna=skipna) + + return nanops.nanskew(self.values, skipna=skipna) + + @Substitution(name='unbiased kurtosis', shortname='kurt', + na_action=_doc_exclude_na, extras='') + @Appender(_stat_doc) + def kurt(self, skipna=True, level=None): + if level is not None: + return self._agg_by_level('kurt', level=level, skipna=skipna) + + return nanops.nankurt(self.values, skipna=skipna) + + def _agg_by_level(self, name, level=0, skipna=True, **kwds): + grouped = self.groupby(level=level) + if 
hasattr(grouped, name) and skipna: + return getattr(grouped, name)(**kwds) + method = getattr(type(self), name) + applyf = lambda x: method(x, skipna=skipna, **kwds) + return grouped.aggregate(applyf) + + def idxmin(self, axis=None, out=None, skipna=True): + """ + Index of first occurence of minimum of values. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + idxmin : Index of mimimum of values + """ + i = nanops.nanargmin(self.values, skipna=skipna) + if i == -1: + return np.nan + return self.index[i] + + def idxmax(self, axis=None, out=None, skipna=True): + """ + Index of first occurence of maximum of values. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + idxmax : Index of mimimum of values + """ + i = nanops.nanargmax(self.values, skipna=skipna) + if i == -1: + return np.nan + return self.index[i] + + def cumsum(self, axis=0, dtype=None, out=None, skipna=True): + """ + Cumulative sum of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + cumsum : Series + """ + arr = self.values.copy() + + do_mask = skipna and not issubclass(self.dtype.type, np.integer) + if do_mask: + mask = isnull(arr) + np.putmask(arr, mask, 0.) + + result = arr.cumsum() + + if do_mask: + np.putmask(result, mask, np.nan) + + return Series(result, index=self.index) + + def cumprod(self, axis=0, dtype=None, out=None, skipna=True): + """ + Cumulative product of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + cumprod : Series + """ + arr = self.values.copy() + + do_mask = skipna and not issubclass(self.dtype.type, np.integer) + if do_mask: + mask = isnull(arr) + np.putmask(arr, mask, 1.) + + result = arr.cumprod() + + if do_mask: + np.putmask(result, mask, np.nan) + + return Series(result, index=self.index) + + def cummax(self, axis=0, dtype=None, out=None, skipna=True): + """ + Cumulative max of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + cummax : Series + """ + arr = self.values.copy() + + do_mask = skipna and not issubclass(self.dtype.type, np.integer) + if do_mask: + mask = isnull(arr) + np.putmask(arr, mask, -np.inf) + + result = np.maximum.accumulate(arr) + + if do_mask: + np.putmask(result, mask, np.nan) + + return Series(result, index=self.index) + + def cummin(self, axis=0, dtype=None, out=None, skipna=True): + """ + Cumulative min of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. 
+ + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values + + Returns + ------- + cummin : Series + """ + arr = self.values.copy() + + do_mask = skipna and not issubclass(self.dtype.type, np.integer) + if do_mask: + mask = isnull(arr) + np.putmask(arr, mask, np.inf) + + result = np.minimum.accumulate(arr) + + if do_mask: + np.putmask(result, mask, np.nan) + + return Series(result, index=self.index) + + @Appender(np.ndarray.round.__doc__) + def round(self, decimals=0, out=None): + """ + + """ + result = self.values.round(decimals, out=out) + if out is None: + result = Series(result, index=self.index, name=self.name) + + return result + + def quantile(self, q=0.5): + """ + Return value at the given quantile, a la scoreatpercentile in + scipy.stats + + Parameters + ---------- + q : quantile + 0 <= q <= 1 + + Returns + ------- + quantile : float + """ + valid_values = self.dropna().values + if len(valid_values) == 0: + return np.nan + return _quantile(valid_values, q * 100) + + def ptp(self, axis=None, out=None): + return self.values.ptp(axis, out) + + def describe(self, percentile_width=50): + """ + Generate various summary statistics of Series, excluding NaN + values. These include: count, mean, std, min, max, and + lower%/50%/upper% percentiles + + Parameters + ---------- + percentile_width : float, optional + width of the desired uncertainty interval, default is 50, + which corresponds to lower=25, upper=75 + + Returns + ------- + desc : Series + """ + try: + from collections import Counter + except ImportError: # pragma: no cover + # For Python < 2.7, we include a local copy of this: + from pandas.util.counter import Counter + + if self.dtype == object: + names = ['count', 'unique', 'top', 'freq'] + + objcounts = Counter(self.dropna().values) + top, freq = objcounts.most_common(1)[0] + data = [self.count(), len(objcounts), top, freq] + + elif issubclass(self.dtype.type, np.datetime64): + names = ['count', 'unique', 'first', 'last', 'top', 'freq'] + + asint = self.dropna().view('i8') + objcounts = Counter(asint) + top, freq = objcounts.most_common(1)[0] + data = [self.count(), len(objcounts), + lib.Timestamp(asint.min()), + lib.Timestamp(asint.max()), + lib.Timestamp(top), freq] + else: + + lb = .5 * (1. - percentile_width/100.) + ub = 1. 
- lb + + def pretty_name(x): + x *= 100 + if x == int(x): + return '%.0f%%' % x + else: + return '%.1f%%' % x + + names = ['count', 'mean', 'std', 'min', + pretty_name(lb), '50%', pretty_name(ub), + 'max'] + + data = [self.count(), self.mean(), self.std(), self.min(), + self.quantile(lb), self.median(), self.quantile(ub), + self.max()] + + return Series(data, index=names) + + def corr(self, other, method='pearson'): + """ + Compute correlation two Series, excluding missing values + + Parameters + ---------- + other : Series + method : {'pearson', 'kendall', 'spearman'} + pearson : standard correlation coefficient + kendall : Kendall Tau correlation coefficient + spearman : Spearman rank correlation + + Returns + ------- + correlation : float + """ + this, other = self.align(other, join='inner', copy=False) + return nanops.nancorr(this.values, other.values, method=method) + + def cov(self, other): + """ + Compute covariance with Series, excluding missing values + + Parameters + ---------- + other : Series + + Returns + ------- + covariance : float + """ + this, other = self.align(other, join='inner') + if len(this) == 0: + return np.nan + return nanops.nancov(this.values, other.values) + + def diff(self, periods=1): + """ + 1st discrete difference of object + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming difference + + Returns + ------- + diffed : Series + """ + return (self - self.shift(periods)) + + def autocorr(self): + """ + Lag-1 autocorrelation + + Returns + ------- + autocorr : float + """ + return self.corr(self.shift(1)) + + def clip(self, lower=None, upper=None, out=None): + """ + Trim values at input threshold(s) + + Parameters + ---------- + lower : float, default None + upper : float, default None + + Returns + ------- + clipped : Series + """ + if out is not None: # pragma: no cover + raise Exception('out argument is not supported yet') + + result = self + if lower is not None: + result = result.clip_lower(lower) + if upper is not None: + result = result.clip_upper(upper) + + return result + + def clip_upper(self, threshold): + """ + Return copy of series with values above given value truncated + + See also + -------- + clip + + Returns + ------- + clipped : Series + """ + return np.where(self > threshold, threshold, self) + + def clip_lower(self, threshold): + """ + Return copy of series with values below given value truncated + + See also + -------- + clip + + Returns + ------- + clipped : Series + """ + return np.where(self < threshold, threshold, self) + +#------------------------------------------------------------------------------- +# Combination + + def append(self, to_append, verify_integrity=False): + """ + Concatenate two or more Series. The indexes must not overlap + + Parameters + ---------- + to_append : Series or list/tuple of Series + verify_integrity : boolean, default False + If True, raise Exception on creating index with duplicates + + Returns + ------- + appended : Series + """ + from pandas.tools.merge import concat + if isinstance(to_append, (list, tuple)): + to_concat = [self] + to_append + else: + to_concat = [self, to_append] + return concat(to_concat, ignore_index=False, + verify_integrity=verify_integrity) + + def _binop(self, other, func, level=None, fill_value=None): + """ + Perform generic binary operation with optional fill value + + Parameters + ---------- + other : Series + func : binary operator + fill_value : float or object + Value to substitute for NA/null values. 
If both Series are NA in a + location, the result will be NA regardless of the passed fill value + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + + Returns + ------- + combined : Series + """ + assert(isinstance(other, Series)) + + new_index = self.index + this = self + + if not self.index.equals(other.index): + this, other = self.align(other, level=level, join='outer') + new_index = this.index + + this_vals = this.values + other_vals = other.values + + if fill_value is not None: + this_mask = isnull(this_vals) + other_mask = isnull(other_vals) + this_vals = this_vals.copy() + other_vals = other_vals.copy() + + # one but not both + mask = this_mask ^ other_mask + this_vals[this_mask & mask] = fill_value + other_vals[other_mask & mask] = fill_value + + result = func(this_vals, other_vals) + name = _maybe_match_name(self, other) + return Series(result, index=new_index, name=name) + + add = _flex_method(operator.add, 'add') + sub = _flex_method(operator.sub, 'subtract') + mul = _flex_method(operator.mul, 'multiply') + try: + div = _flex_method(operator.div, 'divide') + except AttributeError: # pragma: no cover + # Python 3 + div = _flex_method(operator.truediv, 'divide') + + def combine(self, other, func, fill_value=nan): + """ + Perform elementwise binary operation on two Series using given function + with optional fill value when an index is missing from one Series or + the other + + Parameters + ---------- + other : Series or scalar value + func : function + fill_value : scalar value + + Returns + ------- + result : Series + """ + if isinstance(other, Series): + new_index = self.index + other.index + new_name = _maybe_match_name(self, other) + new_values = np.empty(len(new_index), dtype=self.dtype) + for i, idx in enumerate(new_index): + lv = self.get(idx, fill_value) + rv = other.get(idx, fill_value) + new_values[i] = func(lv, rv) + else: + new_index = self.index + new_values = func(self.values, other) + new_name = self.name + return Series(new_values, index=new_index, name=new_name) + + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values + first. Result index will be the union of the two indexes + + Parameters + ---------- + other : Series + + Returns + ------- + y : Series + """ + new_index = self.index + other.index + this = self.reindex(new_index, copy=False) + other = other.reindex(new_index, copy=False) + name = _maybe_match_name(self, other) + return Series(np.where(isnull(this), other, this), index=new_index, + name=name) + + def update(self, other): + """ + Modify Series in place using non-NA values from passed + Series. Aligns on index + + Parameters + ---------- + other : Series + """ + other = other.reindex_like(self) + mask = notnull(other) + np.putmask(self.values, mask, other.values) + + #---------------------------------------------------------------------- + # Reindexing, sorting + + def sort(self, axis=0, kind='quicksort', order=None): + """ + Sort values and index labels by value, in place. For compatibility with + ndarray API. No return value + + Parameters + ---------- + axis : int (can only be zero) + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 
'mergesort' is the only stable algorithm + order : ignored + """ + sortedSeries = self.order(na_last=True, kind=kind) + + true_base = self + while true_base.base is not None: + true_base = true_base.base + + if (true_base is not None and + (true_base.ndim != 1 or true_base.shape != self.shape)): + raise Exception('This Series is a view of some other array, to ' + 'sort in-place you must create a copy') + + self[:] = sortedSeries + self.index = sortedSeries.index + + def sort_index(self, ascending=True): + """ + Sort object by labels (along an axis) + + Parameters + ---------- + ascending : boolean, default True + Sort ascending vs. descending + + Returns + ------- + sorted_obj : Series + """ + new_labels, indexer = self.index.order(return_indexer=True, + ascending=ascending) + new_values = self.values.take(indexer) + return Series(new_values, new_labels, name=self.name) + + def argsort(self, axis=0, kind='quicksort', order=None): + """ + Overrides ndarray.argsort. Argsorts the value, omitting NA/null values, + and places the result in the same locations as the non-NA values + + Parameters + ---------- + axis : int (can only be zero) + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'quicksort' + Choice of sorting algorithm. See np.sort for more + information. 'mergesort' is the only stable algorithm + order : ignored + + Returns + ------- + argsorted : Series + """ + values = self.values + mask = isnull(values) + + if mask.any(): + result = values.copy() + notmask = -mask + result[notmask] = np.argsort(values[notmask], kind=kind) + return Series(result, index=self.index, name=self.name) + else: + return Series(np.argsort(values, kind=kind), index=self.index, + name=self.name) + + def rank(self, method='average', na_option='keep', ascending=True): + """ + Compute data ranks (1 through n). Equal values are assigned a rank that + is the average of the ranks of those values + + Parameters + ---------- + method : {'average', 'min', 'max', 'first'} + average: average rank of group + min: lowest rank in group + max: highest rank in group + first: ranks assigned in order they appear in the array + na_option : {'keep'} + keep: leave NA values where they are + ascending : boolean, default True + False for ranks by high (1) to low (N) + + Returns + ------- + ranks : Series + """ + from pandas.core.algorithms import rank + ranks = rank(self.values, method=method, na_option=na_option, + ascending=ascending) + return Series(ranks, index=self.index, name=self.name) + + def order(self, na_last=True, ascending=True, kind='mergesort'): + """ + Sorts Series object, by value, maintaining index-value link + + Parameters + ---------- + na_last : boolean (optional, default=True) + Put NaN's at beginning or end + ascending : boolean, default True + Sort ascending. Passing False sorts descending + kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort' + Choice of sorting algorithm. See np.sort for more + information. 
'mergesort' is the only stable algorith + + Returns + ------- + y : Series + """ + def _try_mergesort(arr): + # easier to ask forgiveness than permission + try: + return arr.argsort(kind='mergesort') + except TypeError: + # stable sort not available for object dtype + return arr.argsort() + + arr = self.values + sortedIdx = np.empty(len(self), dtype=np.int32) + + bad = isnull(arr) + + good = -bad + idx = np.arange(len(self)) + + argsorted = _try_mergesort(arr[good]) + + if not ascending: + argsorted = argsorted[::-1] + + if na_last: + n = good.sum() + sortedIdx[:n] = idx[good][argsorted] + sortedIdx[n:] = idx[bad] + else: + n = bad.sum() + sortedIdx[n:] = idx[good][argsorted] + sortedIdx[:n] = idx[bad] + + return Series(arr[sortedIdx], index=self.index[sortedIdx], + name=self.name) + + def sortlevel(self, level=0, ascending=True): + """ + Sort Series with MultiIndex by chosen level. Data will be + lexicographically sorted by the chosen level followed by the other + levels (in order) + + Parameters + ---------- + level : int + ascending : bool, default True + + Returns + ------- + sorted : Series + """ + if not isinstance(self.index, MultiIndex): + raise Exception('can only sort by level with a hierarchical index') + + new_index, indexer = self.index.sortlevel(level, ascending=ascending) + new_values = self.values.take(indexer) + return Series(new_values, index=new_index, name=self.name) + + def swaplevel(self, i, j, copy=True): + """ + Swap levels i and j in a MultiIndex + + Returns + ------- + swapped : Series + """ + new_index = self.index.swaplevel(i, j) + return Series(self.values, index=new_index, copy=copy, name=self.name) + + def reorder_levels(self, order): + """ + Rearrange index levels using input order. May not drop or duplicate + levels + + Parameters + ---------- + order: list of int representing new level order. + (reference level by number not by key) + axis: where to reorder levels + + Returns + ------- + type of caller (new object) + """ + if not isinstance(self.index, MultiIndex): # pragma: no cover + raise Exception('Can only reorder levels on a hierarchical axis.') + + result = self.copy() + result.index = result.index.reorder_levels(order) + return result + + def unstack(self, level=-1): + """ + Unstack, a.k.a. pivot, Series with MultiIndex to produce DataFrame + + Parameters + ---------- + level : int, string, or list of these, default last level + Level(s) to unstack, can pass level name + + Examples + -------- + >>> s + one a 1. + one b 2. + two a 3. + two b 4. + + >>> s.unstack(level=-1) + a b + one 1. 2. + two 3. 4. + + >>> s.unstack(level=0) + one two + a 1. 2. + b 3. 4. 
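+
+        A minimal, hypothetical sketch of how such a Series might be built
+        (the labels and values are purely illustrative):
+
+        >>> from pandas import Series, MultiIndex
+        >>> tuples = [('one', 'a'), ('one', 'b'), ('two', 'a'), ('two', 'b')]
+        >>> s = Series([1., 2., 3., 4.], index=MultiIndex.from_tuples(tuples))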
+ + Returns + ------- + unstacked : DataFrame + """ + from pandas.core.reshape import unstack + return unstack(self, level) + + #---------------------------------------------------------------------- + # function application + + def map(self, arg): + """ + Map values of Series using input correspondence (which can be + a dict, Series, or function) + + Parameters + ---------- + arg : function, dict, or Series + + Examples + -------- + >>> x + one 1 + two 2 + three 3 + + >>> y + 1 foo + 2 bar + 3 baz + + >>> x.map(y) + one foo + two bar + three baz + + Returns + ------- + y : Series + same index as caller + """ + if isinstance(arg, (dict, Series)): + if isinstance(arg, dict): + arg = Series(arg) + + indexer = arg.index.get_indexer(self.values) + new_values = com.take_1d(arg.values, indexer) + return Series(new_values, index=self.index, name=self.name) + else: + mapped = lib.map_infer(self.values, arg) + return Series(mapped, index=self.index, name=self.name) + + def apply(self, func, convert_dtype=True): + """ + Invoke function on values of Series. Can be ufunc or Python function + expecting only single values + + Parameters + ---------- + func : function + convert_dtype : boolean, default True + Try to find better dtype for elementwise function results. If + False, leave as dtype=object + + See also + -------- + Series.map: For element-wise operations + + Returns + ------- + y : Series + """ + try: + result = func(self) + if isinstance(result, np.ndarray): + result = Series(result, index=self.index, name=self.name) + else: + raise ValueError('Must yield array') + return result + except Exception: + mapped = lib.map_infer(self.values, func, convert=convert_dtype) + return Series(mapped, index=self.index, name=self.name) + + def align(self, other, join='outer', level=None, copy=True, + fill_value=None, method=None, inplace=False, limit=None): + """ + Align two Series object with the specified join method + + Parameters + ---------- + other : Series + join : {'outer', 'inner', 'left', 'right'}, default 'outer' + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + copy : boolean, default True + Always return new objects. If copy=False and no reindexing is + required, the same object will be returned (for better performance) + fill_value : object, default None + method : str, default 'pad' + limit : int, default None + fill_value, method, inplace, limit are passed to fillna + + Returns + ------- + (left, right) : (Series, Series) + Aligned Series + """ + join_index, lidx, ridx = self.index.join(other.index, how=join, + level=level, + return_indexers=True) + + left = self._reindex_indexer(join_index, lidx, copy) + right = other._reindex_indexer(join_index, ridx, copy) + fill_na = (fill_value is not None) or (method is not None) + if fill_na: + return (left.fillna(fill_value, method=method, limit=limit), + right.fillna(fill_value, method=method, limit=limit)) + else: + return left, right + + def _reindex_indexer(self, new_index, indexer, copy): + if indexer is not None: + new_values = com.take_1d(self.values, indexer) + else: + if copy: + result = self.copy() + else: + result = self + return result + + # be subclass-friendly + return self._constructor(new_values, new_index, name=self.name) + + def reindex(self, index=None, method=None, level=None, fill_value=np.nan, + limit=None, copy=True): + """Conform Series to new index with optional filling logic, placing + NA/NaN in locations having no value in the previous index. 
A new object + is produced unless the new index is equivalent to the current one and + copy=False + + Parameters + ---------- + index : array-like or Index + New labels / index to conform to. Preferably an Index object to + avoid duplicating data + method : {'backfill', 'bfill', 'pad', 'ffill', None} + Method to use for filling holes in reindexed Series + pad / ffill: propagate LAST valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + copy : boolean, default True + Return a new object, even if the passed indexes are the same + level : int or name + Broadcast across a level, matching Index values on the + passed MultiIndex level + fill_value : scalar, default np.NaN + Value to use for missing values. Defaults to NaN, but can be any + "compatible" value + limit : int, default None + Maximum size gap to forward or backward fill + + Returns + ------- + reindexed : Series + """ + if index is None: + raise ValueError('Must pass Index or sequence, not None') + + index = _ensure_index(index) + if self.index.equals(index): + if copy: + result = self.copy() + result.index = index + return result + else: + return self + + if len(self.index) == 0: + return Series(nan, index=index, name=self.name) + + new_index, indexer = self.index.reindex(index, method=method, + level=level, limit=limit) + new_values = com.take_1d(self.values, indexer, fill_value=fill_value) + return Series(new_values, index=new_index, name=self.name) + + def reindex_like(self, other, method=None, limit=None): + """ + Reindex Series to match index of another Series, optionally with + filling logic + + Parameters + ---------- + other : Series + method : string or None + See Series.reindex docstring + limit : int, default None + Maximum size gap to forward or backward fill + + Notes + ----- + Like calling s.reindex(other.index, method=...) + + Returns + ------- + reindexed : Series + """ + return self.reindex(other.index, method=method, limit=limit) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take, return Series corresponding to requested + indices + + Parameters + ---------- + indices : list / array of ints + + Returns + ------- + taken : Series + """ + indices = com._ensure_platform_int(indices) + new_index = self.index.take(indices) + new_values = self.values.take(indices) + return Series(new_values, index=new_index, name=self.name) + + truncate = generic.truncate + + def fillna(self, value=None, method='pad', inplace=False, + limit=None): + """ + Fill NA/NaN values using the specified method + + Parameters + ---------- + value : any kind (should be same type as array) + Value to use to fill holes (e.g. 0) + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + inplace : boolean, default False + If True, fill the Series in place. Note: this will modify any other + views on this Series, for example a column in a DataFrame. 
Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + See also + -------- + reindex, asfreq + + Returns + ------- + filled : Series + """ + if value is not None: + result = self.copy() if not inplace else self + mask = isnull(self.values) + np.putmask(result, mask, value) + else: + if method is None: # pragma: no cover + raise ValueError('must specify a fill method') + + fill_f = _get_fill_func(method) + + if inplace: + values = self.values + else: + values = self.values.copy() + + fill_f(values, limit=limit) + + if inplace: + result = self + else: + result = Series(values, index=self.index, name=self.name) + + return result + + + def replace(self, to_replace, value=None, method='pad', inplace=False, + limit=None): + """ + Replace arbitrary values in a Series + + Parameters + ---------- + to_replace : list or dict + list of values to be replaced or dict of replacement values + value : anything + if to_replace is a list then value is the replacement value + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad' + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + inplace : boolean, default False + If True, fill the Series in place. Note: this will modify any other + views on this Series, for example a column in a DataFrame. Returns + a reference to the filled object, which is self if inplace=True + limit : int, default None + Maximum size gap to forward or backward fill + + Notes + ----- + replace does not distinguish between NaN and None + + See also + -------- + fillna, reindex, asfreq + + Returns + ------- + replaced : Series + """ + result = self.copy() if not inplace else self + + def _rep_one(s, to_rep, v): # replace single value + mask = com.mask_missing(s.values, to_rep) + np.putmask(s.values, mask, v) + return s + + def _rep_dict(rs, to_rep): # replace {[src] -> dest} + + all_src = set() + dd = {} # group by unique destination value + for s, d in to_rep.iteritems(): + dd.setdefault(d, []).append(s) + all_src.add(s) + + if any(d in all_src for d in dd.keys()): + # don't clobber each other at the cost of temporaries + masks = {} + for d, sset in dd.iteritems(): # now replace by each dest + masks[d] = com.mask_missing(rs.values, sset) + + for d, m in masks.iteritems(): + np.putmask(rs.values, m, d) + else: # if no risk of clobbering then simple + for d, sset in dd.iteritems(): + _rep_one(rs, sset, d) + return rs + + if np.isscalar(to_replace): + to_replace = [to_replace] + + if isinstance(to_replace, dict): + return _rep_dict(result, to_replace) + + if isinstance(to_replace, (list, np.ndarray)): + + if isinstance(value, (list, np.ndarray)): # check same length + vl, rl = len(value), len(to_replace) + if vl == rl: + return _rep_dict(result, dict(zip(to_replace, value))) + raise ValueError('Got %d to replace but %d values' % (rl, vl)) + + elif value is not None: # otherwise all replaced with same value + + return _rep_one(result, to_replace, value) + + else: # method + if method is None: # pragma: no cover + raise ValueError('must specify a fill method') + fill_f = _get_fill_func(method) + + mask = com.mask_missing(result, to_replace) + fill_f(result.values, limit=limit, mask=mask) + + if not inplace: + result = Series(result.values, index=self.index, + name=self.name) + return result + + + raise ValueError('Unrecognized to_replace type %s' % + 
type(to_replace)) + + def isin(self, values): + """ + Return boolean vector showing whether each element in the Series is + exactly contained in the passed sequence of values + + Parameters + ---------- + values : sequence + + Returns + ------- + isin : Series (boolean dtype) + """ + value_set = set(values) + result = lib.ismember(self.values, value_set) + return Series(result, self.index, name=self.name) + + def between(self, left, right, inclusive=True): + """ + Return boolean Series equivalent to left <= series <= right. NA values + will be treated as False + + Parameters + ---------- + left : scalar + Left boundary + right : scalar + Right boundary + + Returns + ------- + is_between : Series + """ + if inclusive: + lmask = self >= left + rmask = self <= right + else: + lmask = self > left + rmask = self < right + + return lmask & rmask + + @classmethod + def from_csv(cls, path, sep=',', parse_dates=True, header=None, + index_col=0, encoding=None): + """ + Read delimited file into Series + + Parameters + ---------- + path : string file path or file handle / StringIO + sep : string, default ',' + Field delimiter + parse_dates : boolean, default True + Parse dates. Different default from read_table + header : int, default 0 + Row to use at header (skip prior rows) + index_col : int or sequence, default 0 + Column to use for index. If a sequence is given, a MultiIndex + is used. Different default from read_table + encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + + Returns + ------- + y : Series + """ + from pandas.core.frame import DataFrame + df = DataFrame.from_csv(path, header=header, index_col=index_col, + sep=sep, parse_dates=parse_dates, + encoding=encoding) + result = df.ix[:, 0] + result.index.name = result.name = None + return result + + def to_csv(self, path, index=True, sep=",", na_rep='', header=False, + index_label=None, mode='w', nanRep=None, encoding=None): + """ + Write Series to a comma-separated values (csv) file + + Parameters + ---------- + path : string file path or file handle / StringIO + na_rep : string, default '' + Missing data rep'n + header : boolean, default False + Write out series name + index : boolean, default True + Write row names (index) + index_label : string or sequence, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the DataFrame uses MultiIndex. + mode : Python write mode, default 'w' + sep : character, default "," + Field delimiter for the output file. 
+ encoding : string, optional + a string representing the encoding to use if the contents are + non-ascii, for python versions prior to 3 + """ + from pandas.core.frame import DataFrame + df = DataFrame(self) + df.to_csv(path, index=index, sep=sep, na_rep=na_rep, header=header, + index_label=index_label, mode=mode, nanRep=nanRep, + encoding=encoding) + + def dropna(self): + """ + Return Series without null values + + Returns + ------- + valid : Series + """ + return remove_na(self) + + valid = lambda self: self.dropna() + + isnull = isnull + notnull = notnull + + def first_valid_index(self): + """ + Return label for first non-NA/null value + """ + if len(self) == 0: + return None + + mask = isnull(self.values) + i = mask.argmin() + if mask[i]: + return None + else: + return self.index[i] + + def last_valid_index(self): + """ + Return label for last non-NA/null value + """ + if len(self) == 0: + return None + + mask = isnull(self.values[::-1]) + i = mask.argmin() + if mask[i]: + return None + else: + return self.index[len(self) - i - 1] + + #---------------------------------------------------------------------- + # Time series-oriented methods + + def shift(self, periods=1, freq=None, **kwds): + """ + Shift the index of the Series by desired number of periods with an + optional time offset + + Parameters + ---------- + periods : int + Number of periods to move, can be positive or negative + freq : DateOffset, timedelta, or offset alias string, optional + Increment to use from datetools module or time rule (e.g. 'EOM') + + Returns + ------- + shifted : Series + """ + if periods == 0: + return self.copy() + + offset = _resolve_offset(freq, kwds) + + if isinstance(offset, basestring): + offset = datetools.to_offset(offset) + + if offset is None: + new_values = np.empty(len(self), dtype=self.dtype) + new_values = _maybe_upcast(new_values) + + if periods > 0: + new_values[periods:] = self.values[:-periods] + new_values[:periods] = nan + elif periods < 0: + new_values[:periods] = self.values[-periods:] + new_values[periods:] = nan + + return Series(new_values, index=self.index, name=self.name) + elif isinstance(self.index, PeriodIndex): + orig_offset = datetools.to_offset(self.index.freq) + if orig_offset == offset: + return Series(self, self.index.shift(periods), name=self.name) + msg = ('Given freq %s does not match PeriodIndex freq %s' % + (offset.rule_code, orig_offset.rule_code)) + raise ValueError(msg) + else: + return Series(self, index=self.index.shift(periods, offset), + name=self.name) + + def asof(self, where): + """ + Return last good (non-NaN) value in TimeSeries if value is NaN for + requested date. + + If there is no good value, NaN is returned. 
+ + Parameters + ---------- + wehre : date or array of dates + + Notes + ----- + Dates are assumed to be sorted + + Returns + ------- + value or NaN + """ + if isinstance(where, basestring): + where = datetools.to_datetime(where) + + values = self.values + + if not hasattr(where, '__iter__'): + if where < self.index[0]: + return np.nan + loc = self.index.searchsorted(where, side='right') + if loc > 0: + loc -= 1 + while isnull(values[loc]) and loc > 0: + loc -= 1 + return values[loc] + + if not isinstance(where, Index): + where = Index(where) + + locs = self.index.asof_locs(where, notnull(values)) + new_values = com.take_1d(values, locs) + return Series(new_values, index=where, name=self.name) + + def interpolate(self, method='linear'): + """ + Interpolate missing values (after the first valid value) + + Parameters + ---------- + method : {'linear', 'time', 'values'} + Interpolation method. + 'time' interpolation works on daily and higher resolution + data to interpolate given length of interval + 'values' using the actual index numeric values + + Returns + ------- + interpolated : Series + """ + if method == 'time': + if not isinstance(self, TimeSeries): + raise Exception('time-weighted interpolation only works' + 'on TimeSeries') + inds = np.array([d.toordinal() for d in self.index]) + elif method == 'values': + inds = self.index.values + if inds.dtype == np.object_: + inds = lib.maybe_convert_objects(inds) + else: + inds = np.arange(len(self)) + + values = self.values + + invalid = isnull(values) + valid = -invalid + + firstIndex = valid.argmax() + valid = valid[firstIndex:] + invalid = invalid[firstIndex:] + inds = inds[firstIndex:] + + result = values.copy() + result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid], + values[firstIndex:][valid]) + + return Series(result, index=self.index, name=self.name) + + def rename(self, mapper, inplace=False): + """ + Alter Series index using dict or function + + Parameters + ---------- + mapper : dict-like or function + Transformation to apply to each index + + Notes + ----- + Function / dict values must be unique (1-to-1) + + Examples + -------- + >>> x + foo 1 + bar 2 + baz 3 + + >>> x.rename(str.upper) + FOO 1 + BAR 2 + BAZ 3 + + >>> x.rename({'foo' : 'a', 'bar' : 'b', 'baz' : 'c'}) + a 1 + b 2 + c 3 + + Returns + ------- + renamed : Series (new object) + """ + mapper_f = _get_rename_function(mapper) + result = self if inplace else self.copy() + result.index = [mapper_f(x) for x in self.index] + + return result + + @property + def weekday(self): + return Series([d.weekday() for d in self.index], index=self.index) + + def tz_convert(self, tz, copy=True): + """ + Convert TimeSeries to target time zone + + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + + Returns + ------- + converted : TimeSeries + """ + new_index = self.index.tz_convert(tz) + + new_values = self.values + if copy: + new_values = new_values.copy() + + return Series(new_values, index=new_index, name=self.name) + + def tz_localize(self, tz, copy=True): + """ + Localize tz-naive TimeSeries to target time zone + + Parameters + ---------- + tz : string or pytz.timezone object + copy : boolean, default True + Also make a copy of the underlying data + + Returns + ------- + localized : TimeSeries + """ + new_index = self.index.tz_localize(tz) + + new_values = self.values + if copy: + new_values = new_values.copy() + + return Series(new_values, index=new_index, name=self.name) + + 
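+# Illustrative usage sketch (hypothetical labels and values) of the
+# alignment-based combination methods defined above:
+#
+#   >>> s1 = Series([1., 2., 3.], index=['a', 'b', 'c'])
+#   >>> s2 = Series([10., 20.], index=['b', 'd'])
+#   >>> s1.add(s2, fill_value=0)   # union index; 'b' -> 22.0, lone values kept
+#   >>> s1.combine_first(s2)       # s1's values win wherever both are present
+#   >>> left, right = s1.align(s2, join='outer', fill_value=0)
+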
+_INDEX_TYPES = ndarray, Index, list, tuple + +#------------------------------------------------------------------------------- +# Supplementary functions + +def remove_na(arr): + """ + Return array containing only true/non-NaN values, possibly empty. + """ + return arr[notnull(arr)] + + +def _sanitize_array(data, index, dtype=None, copy=False, + raise_cast_failure=False): + if isinstance(data, ma.MaskedArray): + mask = ma.getmaskarray(data) + data = ma.copy(data) + data[mask] = np.nan + + def _try_cast(arr): + try: + subarr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError): + if dtype is not None and raise_cast_failure: + raise + else: # pragma: no cover + subarr = np.array(data, dtype=object, copy=copy) + return subarr + + # GH #846 + if isinstance(data, np.ndarray): + subarr = data + if dtype is not None: + + # possibility of nan -> garbage + if com.is_float_dtype(data.dtype) and com.is_integer_dtype(dtype): + if not isnull(data).any(): + subarr = _try_cast(data) + elif copy: + subarr = data.copy() + else: + if (com.is_datetime64_dtype(data.dtype) and + not com.is_datetime64_dtype(dtype)): + if dtype == object: + ints = np.asarray(data).view('i8') + subarr = lib.ints_to_pydatetime(ints) + elif raise_cast_failure: + raise TypeError('Cannot cast datetime64 to %s' % dtype) + else: + subarr = _try_cast(data) + elif copy: + subarr = data.copy() + elif isinstance(data, list) and len(data) > 0: + if dtype is not None: + try: + subarr = _try_cast(data) + except Exception: + if raise_cast_failure: # pragma: no cover + raise + subarr = np.array(data, dtype=object, copy=copy) + subarr = lib.maybe_convert_objects(subarr) + else: + subarr = lib.list_to_object_array(data) + subarr = lib.maybe_convert_objects(subarr) + else: + subarr = _try_cast(data) + + if subarr.ndim == 0: + if isinstance(data, list): # pragma: no cover + subarr = np.array(data, dtype=object) + elif index is not None: + value = data + + # If we create an empty array using a string to infer + # the dtype, NumPy will only allocate one character per entry + # so this is kind of bad. Alternately we could use np.repeat + # instead of np.empty (but then you still don't want things + # coming out as np.str_! + if isinstance(value, basestring) and dtype is None: + dtype = np.object_ + + if dtype is None: + subarr = np.empty(len(index), dtype=type(value)) + else: + subarr = np.empty(len(index), dtype=dtype) + subarr.fill(value) + else: + return subarr.item() + elif subarr.ndim > 1: + if isinstance(data, np.ndarray): + raise Exception('Data must be 1-dimensional') + else: + subarr = _asarray_tuplesafe(data, dtype=dtype) + + # This is to prevent mixed-type Series getting all casted to + # NumPy string type, e.g. NaN --> '-1#IND'. 
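+    # (For instance, np.asarray([1.5, 'x']) comes back with a string dtype and
+    # the float is silently stringified; rebuilding with dtype=object keeps
+    # the original Python objects instead.)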
+ if issubclass(subarr.dtype.type, basestring): + subarr = np.array(data, dtype=object, copy=copy) + + return subarr + +def _get_rename_function(mapper): + if isinstance(mapper, (dict, Series)): + def f(x): + if x in mapper: + return mapper[x] + else: + return x + else: + f = mapper + + return f + +def _resolve_offset(freq, kwds): + from pandas.core.datetools import getOffset + + if 'timeRule' in kwds or 'offset' in kwds: + offset = kwds.get('offset', None) + offset = kwds.get('timeRule', offset) + if isinstance(offset, basestring): + offset = datetools.getOffset(offset) + warn = True + else: + offset = freq + warn = False + + if warn and _SHOW_WARNINGS: # pragma: no cover + import warnings + warnings.warn("'timeRule' and 'offset' parameters are deprecated," + " please use 'freq' instead", + FutureWarning) + + return offset + +def _get_fill_func(method): + method = com._clean_fill_method(method) + if method == 'pad': + fill_f = com.pad_1d + elif method == 'backfill': + fill_f = com.backfill_1d + return fill_f + +#---------------------------------------------------------------------- +# Add plotting methods to Series + +import pandas.tools.plotting as _gfx + +Series.plot = _gfx.plot_series +Series.hist = _gfx.hist_series + +# Put here, otherwise monkey-patching in methods fails + +class TimeSeries(Series): + + def _repr_footer(self): + if self.index.freq is not None: + freqstr = 'Freq: %s, ' % self.index.freqstr + else: + freqstr = '' + + namestr = "Name: %s, " % str(self.name) if self.name else "" + return '%s%sLength: %d' % (freqstr, namestr, len(self)) + + def at_time(self, time, asof=False): + """ + Select values at particular time of day (e.g. 9:30AM) + + Parameters + ---------- + time : datetime.time or string + + Returns + ------- + values_at_time : TimeSeries + """ + indexer = self.index.indexer_at_time(time, asof=asof) + return self.take(indexer) + + def between_time(self, start_time, end_time, include_start=True, + include_end=True): + """ + Select values between particular times of the day (e.g., 9:00-9:30 AM) + + Parameters + ---------- + start_time : datetime.time or string + end_time : datetime.time or string + include_start : boolean, default True + include_end : boolean, default True + + Returns + ------- + values_between_time : TimeSeries + """ + indexer = self.index.indexer_between_time( + start_time, end_time, include_start=include_start, + include_end=include_end) + return self.take(indexer) + + def to_timestamp(self, freq=None, how='start', copy=True): + """ + Cast to datetimeindex of timestamps, at *beginning* of period + + Parameters + ---------- + freq : string, default frequency of PeriodIndex + Desired frequency + how : {'s', 'e', 'start', 'end'} + Convention for converting period to timestamp; start of period + vs. 
end + + Returns + ------- + ts : TimeSeries with DatetimeIndex + """ + new_values = self.values + if copy: + new_values = new_values.copy() + + new_index = self.index.to_timestamp(freq=freq, how=how) + return Series(new_values, index=new_index, name=self.name) + + def to_period(self, freq=None, copy=True): + """ + Convert TimeSeries from DatetimeIndex to PeriodIndex with desired + frequency (inferred from index if not passed) + + Parameters + ---------- + freq : string, default + + Returns + ------- + ts : TimeSeries with PeriodIndex + """ + new_values = self.values + if copy: + new_values = new_values.copy() + + if freq is None: + freq = self.index.freqstr or self.index.inferred_freq + new_index = self.index.to_period(freq=freq) + return Series(new_values, index=new_index, name=self.name) diff --git a/pandas/core/sparse.py b/pandas/core/sparse.py new file mode 100644 index 00000000..1405e88a --- /dev/null +++ b/pandas/core/sparse.py @@ -0,0 +1,10 @@ +""" +Data structures for sparse float data. Life is made simpler by dealing only with +float64 data +""" + +# pylint: disable=W0611 + +from pandas.sparse.series import SparseSeries, SparseTimeSeries +from pandas.sparse.frame import SparseDataFrame +from pandas.sparse.panel import SparsePanel diff --git a/pandas/core/strings.py b/pandas/core/strings.py new file mode 100644 index 00000000..14939e3e --- /dev/null +++ b/pandas/core/strings.py @@ -0,0 +1,31 @@ +import numpy as np +from pandas.util.map import mapwrap, auto_map +import re + +startswith = mapwrap(str.startswith) +contains = mapwrap(str.__contains__) +upper = mapwrap(str.upper) +lower = mapwrap(str.lower) + +def _re_get_groups(pattern, n): + def inner(s, *groups): + m = pattern.search(s) + if m: + return m.group(*[int(g) for g in groups]) + return np.nan if n == 1 else [np.nan] * n + + return inner + +def search_re(arr, pattern, groups=(0,)): + if isinstance(pattern, str): + pattern = re.compile(pattern) + + if isinstance(groups, np.ndarray): + if groups.ndim == 1: + n_groups = 1 + else: + n_groups = groups.shape[1] + else: + n_groups = len(groups) + + return auto_map(arr, _re_get_groups(pattern, n_groups), (groups,), n_results=n_groups) diff --git a/pandas/info.py b/pandas/info.py new file mode 100644 index 00000000..754741c1 --- /dev/null +++ b/pandas/info.py @@ -0,0 +1,20 @@ +""" +pandas - a powerful data analysis and manipulation library for Python +===================================================================== + +See http://pandas.sourceforge.net for full documentation. Otherwise, see the +docstrings of the various objects in the pandas namespace: + +Series +DataFrame +Panel +Index +DatetimeIndex +HDFStore +bdate_range +date_range +read_csv +read_fwf +read_table +ols +""" diff --git a/pandas/io/__init__.py b/pandas/io/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/io/data.py b/pandas/io/data.py new file mode 100644 index 00000000..a81b0c7c --- /dev/null +++ b/pandas/io/data.py @@ -0,0 +1,196 @@ +""" +Module contains tools for collecting data from various remote sources + + +""" + +import numpy as np +import datetime as dt +import urllib +import urllib2 +import time + +from zipfile import ZipFile +from pandas.util.py3compat import StringIO, BytesIO, bytes_to_str + +from pandas import DataFrame, read_csv + +def DataReader(name, data_source=None, start=None, end=None, + retry_count=3, pause=0): + """ + Imports data from a number of online sources. + + Currently supports Yahoo! finance, St. Louis FED (FRED), and Kenneth + French's data library. 
+ + Parameters + ---------- + name : str + the name of the dataset + data_source: str + the data source ("yahoo", "fred", or "ff") + start : {datetime, None} + left boundary for range (defaults to 1/1/2010) + end : {datetime, None} + right boundary for range (defaults to today) + + Examples + ---------- + + # Data from Yahoo! + gs = DataReader("GS", "yahoo") + + # Data from FRED + vix = DataReader("VIXCLS", "fred") + + # Data from Fama/French + ff = DataReader("F-F_Research_Data_Factors", "famafrench") + ff = DataReader("F-F_Research_Data_Factors_weekly", "famafrench") + ff = DataReader("6_Portfolios_2x3", "famafrench") + ff = DataReader("F-F_ST_Reversal_Factor", "famafrench") + """ + start, end = _sanitize_dates(start, end) + + if(data_source == "yahoo"): + return get_data_yahoo(name=name, start=start, end=end, + retry_count=retry_count, pause=pause) + elif(data_source == "fred"): + return get_data_fred(name=name, start=start, end=end) + elif(data_source == "famafrench"): + return get_data_famafrench(name=name) + +def _sanitize_dates(start, end): + from pandas.core.datetools import to_datetime + start = to_datetime(start) + end = to_datetime(end) + if start is None: + start = dt.datetime.today() - dt.timedelta(365) + if end is None: + end = dt.datetime.today() + return start, end + +def get_quote_yahoo(symbols): + """ + Get current yahoo quote + + Returns a DataFrame + """ + if not isinstance(symbols,list): + raise TypeError, "symbols must be a list" + # for codes see: http://www.gummy-stuff.org/Yahoo-data.htm + codes = {'symbol':'s','last':'l1','change_pct':'p2','PE':'r','time':'t1','short_ratio':'s7'} + request = str.join('',codes.values()) # code request string + header = codes.keys() + + data = dict(zip(codes.keys(),[[] for i in range(len(codes))])) + + urlStr = 'http://finance.yahoo.com/d/quotes.csv?s=%s&f=%s' % (str.join('+',symbols), request) + + try: + lines = urllib2.urlopen(urlStr).readlines() + except Exception, e: + s = "Failed to download:\n{0}".format(e); + print s + return None + + for line in lines: + fields = line.strip().split(',') + for i,field in enumerate(fields): + if field[-2:] == '%"': + data[header[i]].append(float(field.strip('"%'))) + elif field[0] == '"': + data[header[i]].append( field.strip('"')) + else: + try: + data[header[i]].append(float(field)) + except ValueError: + data[header[i]].append(np.nan) + + idx = data.pop('symbol') + + return DataFrame(data,index=idx) + +def get_data_yahoo(name=None, start=None, end=None, retry_count=3, pause=0): + """ + Get historical data for the given name from yahoo. + Date format is datetime + + Returns a DataFrame. + """ + start, end = _sanitize_dates(start, end) + + if(name is None): + print "Need to provide a name" + return None + + yahoo_URL = 'http://ichart.yahoo.com/table.csv?' + + url = yahoo_URL + 's=%s' % name + \ + '&a=%s' % (start.month - 1) + \ + '&b=%s' % start.day + \ + '&c=%s' % start.year + \ + '&d=%s' % (end.month - 1) + \ + '&e=%s' % end.day + \ + '&f=%s' % end.year + \ + '&g=d' + \ + '&ignore=.csv' + for i in range(0, retry_count): + resp = urllib2.urlopen(url) + if resp.code == 200: + lines = resp.read() + rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0, + parse_dates=True) + return rs[::-1] + time.sleep(pause) + raise Exception( + "after %d tries, Yahoo did not return a 200 for url %s" % (pause, url)) + + + +def get_data_fred(name=None, start=dt.datetime(2010, 1, 1), + end=dt.datetime.today()): + """ + Get data for the given name from the St. Louis FED (FRED). 
+ Date format is datetime + + Returns a DataFrame. + """ + start, end = _sanitize_dates(start, end) + + if(name is None): + print "Need to provide a name" + return None + + fred_URL = "http://research.stlouisfed.org/fred2/series/" + + url = fred_URL + '%s' % name + \ + '/downloaddata/%s' % name + '.csv' + data = read_csv(urllib.urlopen(url), index_col=0, parse_dates=True) + return data.truncate(start, end) + +def get_data_famafrench(name, start=None, end=None): + start, end = _sanitize_dates(start, end) + + # path of zip files + zipFileURL = "http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/" + + url = urllib.urlopen(zipFileURL + name + ".zip") + zipfile = ZipFile(StringIO(url.read())) + data = zipfile.open(name + ".txt").readlines() + + file_edges = np.where(np.array([len(d) for d in data]) == 2)[0] + + datasets = {} + for i in range(len(file_edges)-1): + dataset = [d.split() for d in data[(file_edges[i] + 1):file_edges[i+1]]] + if(len(dataset) > 10): + ncol = np.median(np.array([len(d) for d in dataset])) + header_index = np.where(np.array([len(d) for d in dataset]) == (ncol-1))[0][-1] + header = dataset[header_index] + # to ensure the header is unique + header = [str(j + 1) + " " + header[j] for j in range(len(header))] + index = np.array([d[0] for d in dataset[(header_index + 1):]], dtype=int) + dataset = np.array([d[1:] for d in dataset[(header_index + 1):]], dtype=float) + datasets[i] = DataFrame(dataset, index, columns=header) + + return datasets diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py new file mode 100644 index 00000000..b9325b97 --- /dev/null +++ b/pandas/io/date_converters.py @@ -0,0 +1,49 @@ +"""This module is designed for community supported date conversion functions""" +import numpy as np +import pandas.lib as lib + +def parse_date_time(date_col, time_col): + date_col = _maybe_cast(date_col) + time_col = _maybe_cast(time_col) + return lib.try_parse_date_and_time(date_col, time_col) + +def parse_date_fields(year_col, month_col, day_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + return lib.try_parse_year_month_day(year_col, month_col, day_col) + +def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, + second_col): + year_col = _maybe_cast(year_col) + month_col = _maybe_cast(month_col) + day_col = _maybe_cast(day_col) + hour_col = _maybe_cast(hour_col) + minute_col = _maybe_cast(minute_col) + second_col = _maybe_cast(second_col) + return lib.try_parse_datetime_components(year_col, month_col, day_col, + hour_col, minute_col, second_col) + +def generic_parser(parse_func, *cols): + N = _check_columns(cols) + results = np.empty(N, dtype=object) + + for i in xrange(N): + args = [c[i] for c in cols] + results[i] = parse_func(*args) + + return results + +def _maybe_cast(arr): + if not arr.dtype.type == np.object_: + arr = np.array(arr, dtype=object) + return arr + +def _check_columns(cols): + assert(len(cols) > 0) + + N = len(cols[0]) + for c in cols[1:]: + assert(len(c) == N) + + return N diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py new file mode 100644 index 00000000..797e5de7 --- /dev/null +++ b/pandas/io/parsers.py @@ -0,0 +1,1400 @@ +""" +Module contains tools for processing files into DataFrames or other objects +""" +from StringIO import StringIO +import re +from itertools import izip +from urlparse import urlparse +import csv + +try: + next +except NameError: # pragma: no cover + # Python < 2.6 + def next(x): + return x.next() + +import 
numpy as np + +from pandas.core.index import Index, MultiIndex +from pandas.core.frame import DataFrame +import datetime +import pandas.core.common as com +import pandas.lib as lib +from pandas.util import py3compat +from pandas.io.date_converters import generic_parser + +from pandas.util.decorators import Appender + +class DateConversionError(Exception): + pass + +_parser_params = """Also supports optionally iterating or breaking of the file +into chunks. + +Parameters +---------- +filepath_or_buffer : string or file handle / StringIO. The string could be + a URL. Valid URL schemes include http, ftp, and file. For file URLs, a host + is expected. For instance, a local file could be + file://localhost/path/to/table.csv +%s +dialect : string or csv.Dialect instance, default None + If None defaults to Excel dialect. Ignored if sep longer than 1 char + See csv.Dialect documentation for more details +header : int, default 0 + Row to use for the column labels of the parsed DataFrame +skiprows : list-like or integer + Row numbers to skip (0-indexed) or number of rows to skip (int) +index_col : int or sequence, default None + Column to use as the row labels of the DataFrame. If a sequence is + given, a MultiIndex is used. +names : array-like + List of column names +na_values : list-like or dict, default None + Additional strings to recognize as NA/NaN. If dict passed, specific + per-column NA values +parse_dates : boolean, list of ints or names, list of lists, or dict + True -> try parsing all columns + [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column + [[1, 3]] -> combine columns 1 and 3 and parse as a single date column + {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' +keep_date_col : boolean, default False + If True and parse_dates specifies combining multiple columns then + keep the original columns. +date_parser : function + Function to use for converting strings to datetime objects. Defaults to + dateutil.parser +dayfirst : boolean, default False + DD/MM format dates, international and European format +thousands : str, default None + Thousands separator +comment : str, default None + Indicates remainder of line should not be parsed + Does not support line commenting (will return empty line) +nrows : int, default None + Number of rows of file to read. Useful for reading pieces of large files +iterator : boolean, default False + Return TextParser object +chunksize : int, default None + Return TextParser object for iteration +skip_footer : int, default 0 + Number of lines at bottom of file to skip +converters : dict, optional + Dict of functions for converting values in certain columns. Keys can either + be integers or column labels +verbose : boolean, default False + Indicate number of NA values placed in non-numeric columns +delimiter : string, default None + Alternative argument name for sep. Regular expressions are accepted. +encoding : string, default None + Encoding to use for UTF when reading/writing (ex. 'utf-8') +squeeze : boolean, default False + If the parsed data only contains one column then return a Series + +Returns +------- +result : DataFrame or TextParser +""" + +_csv_sep = """sep : string, default ',' + Delimiter to use. If sep is None, will try to automatically determine + this. Regular expressions are accepted. +""" + +_table_sep = """sep : string, default \\t (tab-stop) + Delimiter to use. 
Regular expressions are accepted.""" + +_read_csv_doc = """ +Read CSV (comma-separated) file into DataFrame + +%s +""" % (_parser_params % _csv_sep) + +_read_table_doc = """ +Read general delimited file into DataFrame + +%s +""" % (_parser_params % _table_sep) + +_fwf_widths = """\ +colspecs : a list of pairs (tuples), giving the extents + of the fixed-width fields of each line as half-open internals + (i.e., [from, to[ ). +widths : a list of field widths, which can be used instead of + 'colspecs' if the intervals are contiguous. +""" + +_read_fwf_doc = """ +Read a table of fixed-width formatted lines into DataFrame + +%s + +Also, 'delimiter' is used to specify the filler character of the +fields if it is not spaces (e.g., '~'). +""" % (_parser_params % _fwf_widths) + + +def _is_url(url): + """ + Very naive check to see if url is an http(s), ftp, or file location. + """ + parsed_url = urlparse(url) + if parsed_url.scheme in ['http','file', 'ftp', 'https']: + return True + else: + return False + +def _read(cls, filepath_or_buffer, kwds): + "Generic reader of line files." + encoding = kwds.get('encoding', None) + + if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): + from urllib2 import urlopen + filepath_or_buffer = urlopen(filepath_or_buffer) + if py3compat.PY3: # pragma: no cover + if encoding: + errors = 'strict' + else: + errors = 'replace' + encoding = 'utf-8' + bytes = filepath_or_buffer.read() + filepath_or_buffer = StringIO(bytes.decode(encoding, errors)) + + if hasattr(filepath_or_buffer, 'read'): + f = filepath_or_buffer + else: + try: + # universal newline mode + f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding) + except Exception: # pragma: no cover + f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding) + + if kwds.get('date_parser', None) is not None: + if isinstance(kwds['parse_dates'], bool): + kwds['parse_dates'] = True + + # Extract some of the arguments (pass chunksize on). + kwds.pop('filepath_or_buffer') + iterator = kwds.pop('iterator') + nrows = kwds.pop('nrows') + chunksize = kwds.get('chunksize', None) + + # Create the parser. + parser = cls(f, **kwds) + + if nrows is not None: + return parser.get_chunk(nrows) + elif chunksize or iterator: + return parser + + return parser.get_chunk() + +@Appender(_read_csv_doc) +def read_csv(filepath_or_buffer, + sep=',', + dialect=None, + header=0, + index_col=None, + names=None, + skiprows=None, + na_values=None, + thousands=None, + comment=None, + parse_dates=False, + keep_date_col=False, + dayfirst=False, + date_parser=None, + nrows=None, + iterator=False, + chunksize=None, + skip_footer=0, + converters=None, + verbose=False, + delimiter=None, + encoding=None, + squeeze=False): + kwds = dict(filepath_or_buffer=filepath_or_buffer, + sep=sep, dialect=dialect, + header=header, index_col=index_col, + names=names, skiprows=skiprows, + na_values=na_values, thousands=thousands, + comment=comment, parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, date_parser=date_parser, + nrows=nrows, iterator=iterator, + chunksize=chunksize, skip_footer=skip_footer, + converters=converters, verbose=verbose, + delimiter=delimiter, encoding=encoding, + squeeze=squeeze) + + # Alias sep -> delimiter. 
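+    # (passing sep=';' or delimiter=';' is therefore equivalent; an explicitly
+    # supplied 'delimiter' takes precedence over 'sep')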
+ sep = kwds.pop('sep') + if kwds.get('delimiter', None) is None: + kwds['delimiter'] = sep + + return _read(TextParser, filepath_or_buffer, kwds) + +@Appender(_read_table_doc) +def read_table(filepath_or_buffer, + sep='\t', + dialect=None, + header=0, + index_col=None, + names=None, + skiprows=None, + na_values=None, + thousands=None, + comment=None, + parse_dates=False, + keep_date_col=False, + dayfirst=False, + date_parser=None, + nrows=None, + iterator=False, + chunksize=None, + skip_footer=0, + converters=None, + verbose=False, + delimiter=None, + encoding=None, + squeeze=False): + kwds = dict(filepath_or_buffer=filepath_or_buffer, + sep=sep, dialect=dialect, + header=header, index_col=index_col, + names=names, skiprows=skiprows, + na_values=na_values, thousands=thousands, + comment=comment, parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, date_parser=date_parser, + nrows=nrows, iterator=iterator, + chunksize=chunksize, skip_footer=skip_footer, + converters=converters, verbose=verbose, + delimiter=delimiter, encoding=encoding, + squeeze=squeeze) + + # Alias sep -> delimiter. + sep = kwds.pop('sep') + if kwds.get('delimiter', None) is None: + kwds['delimiter'] = sep + + # Override as default encoding. + kwds['encoding'] = None + + return _read(TextParser, filepath_or_buffer, kwds) + +@Appender(_read_fwf_doc) +def read_fwf(filepath_or_buffer, + colspecs=None, + widths=None, + header=0, + index_col=None, + names=None, + skiprows=None, + na_values=None, + thousands=None, + comment=None, + parse_dates=False, + keep_date_col=False, + dayfirst=False, + date_parser=None, + nrows=None, + iterator=False, + chunksize=None, + skip_footer=0, + converters=None, + delimiter=None, + verbose=False, + encoding=None, + squeeze=False): + kwds = dict(filepath_or_buffer=filepath_or_buffer, + colspecs=colspecs, widths=widths, + header=header, index_col=index_col, + names=names, skiprows=skiprows, + na_values=na_values, thousands=thousands, + comment=comment, parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, date_parser=date_parser, + nrows=nrows, iterator=iterator, + chunksize=chunksize, skip_footer=skip_footer, + converters=converters, verbose=verbose, + delimiter=delimiter, encoding=encoding, + squeeze=squeeze) + + # Check input arguments. + colspecs = kwds.get('colspecs', None) + widths = kwds.pop('widths', None) + if bool(colspecs is None) == bool(widths is None): + raise ValueError("You must specify only one of 'widths' and " + "'colspecs'") + + # Compute 'colspec' from 'widths', if specified. + if widths is not None: + colspecs, col = [], 0 + for w in widths: + colspecs.append( (col, col+w) ) + col += w + kwds['colspecs'] = colspecs + + kwds['thousands'] = thousands + return _read(FixedWidthFieldParser, filepath_or_buffer, kwds) + +def read_clipboard(**kwargs): # pragma: no cover + """ + Read text from clipboard and pass to read_table. 
See read_table for the + full argument list + + Returns + ------- + parsed : DataFrame + """ + from pandas.util.clipboard import clipboard_get + text = clipboard_get() + return read_table(StringIO(text), **kwargs) + +def to_clipboard(obj): # pragma: no cover + """ + Attempt to write text representation of object to the system clipboard + + Notes + ----- + Requirements for your platform + - Linux: xsel command line tool + - Windows: Python win32 extensions + - OS X: + """ + from pandas.util.clipboard import clipboard_set + clipboard_set(str(obj)) + +class BufferedReader(object): + """ + For handling different kinds of files, e.g. zip files where reading out a + chunk of lines is faster than reading out one line at a time. + """ + + def __init__(self, fh, delimiter=','): + pass # pragma: no coverage + +class BufferedCSVReader(BufferedReader): + pass + + +# common NA values +# no longer excluding inf representations +# '1.#INF','-1.#INF', '1.#INF000000', +_NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', + '#N/A N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '']) + + +class TextParser(object): + """ + Converts lists of lists/tuples into DataFrames with proper type inference + and optional (e.g. string to datetime) conversion. Also enables iterating + lazily over chunks of large files + + Parameters + ---------- + data : file-like object or list + delimiter : separator character to use + dialect : str or csv.Dialect instance, default None + Ignored if delimiter is longer than 1 character + names : sequence, default + header : int, default 0 + Row to use to parse column labels. Defaults to the first row. Prior + rows will be discarded + index_col : int or list, default None + Column or columns to use as the (possibly hierarchical) index + na_values : iterable, default None + Custom NA values + thousands : str, default None + Thousands separator + comment : str, default None + Comment out remainder of line + parse_dates : boolean, default False + keep_date_col : boolean, default False + date_parser : function, default None + skiprows : list of integers + Row numbers to skip + skip_footer : int + Number of line at bottom of file to skip + encoding : string, default None + Encoding to use for UTF when reading/writing (ex. 'utf-8') + squeeze : boolean, default False + returns Series if only one column + """ + + def __init__(self, f, delimiter=None, dialect=None, names=None, header=0, + index_col=None, na_values=None, thousands=None, + comment=None, parse_dates=False, keep_date_col=False, + date_parser=None, dayfirst=False, + chunksize=None, skiprows=None, skip_footer=0, converters=None, + verbose=False, encoding=None, squeeze=False): + """ + Workhorse function for processing nested list into DataFrame + + Should be replaced by np.genfromtxt eventually? 
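+
+        A hypothetical construction sketch (the rows and column names below
+        are made up for illustration):
+
+        >>> rows = [['a', 'b'], ['1', '2'], ['3', '4']]
+        >>> parser = TextParser(rows, header=0)
+        >>> df = parser.get_chunk()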
+ """ + self.data = None + self.buf = [] + self.pos = 0 + self.names = list(names) if names is not None else names + self.header = header + self.index_col = index_col + self.chunksize = chunksize + self.passed_names = names is not None + self.encoding = encoding + + self.parse_dates = parse_dates + self.keep_date_col = keep_date_col + self.date_parser = date_parser + self.dayfirst = dayfirst + + if com.is_integer(skiprows): + skiprows = range(skiprows) + self.skiprows = set() if skiprows is None else set(skiprows) + + self.skip_footer = skip_footer + self.delimiter = delimiter + self.dialect = dialect + self.verbose = verbose + + if converters is not None: + assert(isinstance(converters, dict)) + self.converters = converters + else: + self.converters = {} + + assert(self.skip_footer >= 0) + + if na_values is None: + self.na_values = _NA_VALUES + elif isinstance(na_values, dict): + self.na_values = na_values + else: + self.na_values = set(list(na_values)) | _NA_VALUES + + self.thousands = thousands + self.comment = comment + self._comment_lines = [] + + if hasattr(f, 'readline'): + self._make_reader(f) + else: + self.data = f + self.columns = self._infer_columns() + + # needs to be cleaned/refactored + # multiple date column thing turning into a real sphaghetti factory + + # get popped off for index + self.orig_columns = list(self.columns) + + self.index_name = None + self._name_processed = False + if not self._has_complex_date_col: + self.index_name = self._get_index_name() + self._name_processed = True + self._first_chunk = True + + self.squeeze = squeeze + + def _make_reader(self, f): + sep = self.delimiter + + if sep is None or len(sep) == 1: + sniff_sep = True + # default dialect + if self.dialect is None: + dia = csv.excel() + elif isinstance(self.dialect, basestring): + dia = csv.get_dialect(self.dialect) + else: + dia = self.dialect + + if sep is not None: + sniff_sep = False + dia.delimiter = sep + # attempt to sniff the delimiter + if sniff_sep: + line = f.readline() + while self.pos in self.skiprows: + self.pos += 1 + line = f.readline() + + line = self._check_comments([line])[0] + + self.pos += 1 + sniffed = csv.Sniffer().sniff(line) + dia.delimiter = sniffed.delimiter + if self.encoding is not None: + self.buf.extend(list( + com.UnicodeReader(StringIO(line), + dialect=dia, + encoding=self.encoding))) + else: + self.buf.extend(list(csv.reader(StringIO(line), + dialect=dia))) + + if self.encoding is not None: + reader = com.UnicodeReader(f, dialect=dia, + encoding=self.encoding) + else: + reader = csv.reader(f, dialect=dia) + else: + reader = (re.split(sep, line.strip()) for line in f) + + self.data = reader + + def _infer_columns(self): + names = self.names + passed_names = self.names is not None + if passed_names: + self.header = None + + if self.header is not None: + if len(self.buf) > 0: + line = self.buf[0] + else: + line = self._next_line() + + while self.pos <= self.header: + line = self._next_line() + + columns = [] + for i, c in enumerate(line): + if c == '': + columns.append('Unnamed: %d' % i) + else: + columns.append(c) + + counts = {} + for i, col in enumerate(columns): + cur_count = counts.get(col, 0) + if cur_count > 0: + columns[i] = '%s.%d' % (col, cur_count) + counts[col] = cur_count + 1 + self._clear_buffer() + else: + line = self._next_line() + + ncols = len(line) + if not names: + columns = ['X.%d' % (i + 1) for i in range(ncols)] + else: + columns = names + + return columns + + def _next_line(self): + if isinstance(self.data, list): + while self.pos in 
self.skiprows: + self.pos += 1 + + try: + line = self.data[self.pos] + except IndexError: + raise StopIteration + else: + while self.pos in self.skiprows: + next(self.data) + self.pos += 1 + + line = next(self.data) + + line = self._check_comments([line])[0] + line = self._check_thousands([line])[0] + + self.pos += 1 + self.buf.append(line) + + return line + + def _check_comments(self, lines): + if self.comment is None: + return lines + ret = [] + for l in lines: + rl = [] + for x in l: + if (not isinstance(x, basestring) or + self.comment not in x): + rl.append(x) + else: + x = x[:x.find(self.comment)] + if len(x) > 0: + rl.append(x) + break + ret.append(rl) + return ret + + def _check_thousands(self, lines): + if self.thousands is None: + return lines + nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + ret = [] + for l in lines: + rl = [] + for x in l: + if (not isinstance(x, basestring) or + self.thousands not in x or + nonnum.search(x.strip())): + rl.append(x) + else: + rl.append(x.replace(',', '')) + ret.append(rl) + return ret + + def _clear_buffer(self): + self.buf = [] + + def __iter__(self): + try: + while True: + yield self.get_chunk(self.chunksize) + except StopIteration: + pass + + _implicit_index = False + + def _get_index_name(self, columns=None): + if columns is None: + columns = self.columns + + try: + line = self._next_line() + except StopIteration: + line = None + + try: + next_line = self._next_line() + except StopIteration: + next_line = None + + index_name = None + + # implicitly index_col=0 b/c 1 fewer column names + implicit_first_cols = 0 + if line is not None: + implicit_first_cols = len(line) - len(columns) + if next_line is not None: + if len(next_line) == len(line) + len(columns): + implicit_first_cols = 0 + self.index_col = range(len(line)) + self.buf = self.buf[1:] + return line + + if implicit_first_cols > 0: + self._implicit_index = True + if self.index_col is None: + if implicit_first_cols == 1: + self.index_col = 0 + else: + self.index_col = range(implicit_first_cols) + index_name = None + elif np.isscalar(self.index_col): + if isinstance(self.index_col, basestring): + index_name = self.index_col + for i, c in enumerate(list(columns)): + if c == self.index_col: + self.index_col = i + columns.pop(i) + break + else: + index_name = columns.pop(self.index_col) + + if index_name is not None and 'Unnamed' in index_name: + index_name = None + + elif self.index_col is not None: + cp_cols = list(columns) + index_name = [] + index_col = list(self.index_col) + for i, c in enumerate(index_col): + if isinstance(c, basestring): + index_name.append(c) + for j, name in enumerate(cp_cols): + if name == c: + index_col[i] = j + columns.remove(name) + break + else: + name = cp_cols[c] + columns.remove(name) + index_name.append(name) + self.index_col = index_col + + return index_name + + def get_chunk(self, rows=None): + if rows is not None and self.skip_footer: + raise ValueError('skip_footer not supported for iteration') + + try: + content = self._get_lines(rows) + except StopIteration: + if self._first_chunk: + content = [] + else: + raise + + # done with first read, next time raise StopIteration + self._first_chunk = False + + if len(content) == 0: # pragma: no cover + if self.index_col is not None: + if np.isscalar(self.index_col): + index = Index([], name=self.index_name) + else: + index = MultiIndex.from_arrays([[]] * len(self.index_col), + names=self.index_name) + else: + index = Index([]) + + return DataFrame(index=index, columns=self.columns) + + 
zipped_content = list(lib.to_object_array(content).T) + + if not self._has_complex_date_col and self.index_col is not None: + index = self._get_simple_index(zipped_content) + index = self._agg_index(index) + else: + index = Index(np.arange(len(content))) + + col_len, zip_len = len(self.columns), len(zipped_content) + if col_len != zip_len: + row_num = -1 + for (i, l) in enumerate(content): + if len(l) != col_len: + break + + footers = 0 + if self.skip_footer: + footers = self.skip_footer + row_num = self.pos - (len(content) - i + footers) + + msg = ('Expecting %d columns, got %d in row %d' % + (col_len, zip_len, row_num)) + raise ValueError(msg) + + data = dict((k, v) for k, v in izip(self.columns, zipped_content)) + + # apply converters + for col, f in self.converters.iteritems(): + if isinstance(col, int) and col not in self.columns: + col = self.columns[col] + data[col] = lib.map_infer(data[col], f) + + columns = list(self.columns) + if self.parse_dates is not None: + data, columns = self._process_date_conversion(data) + + data = _convert_to_ndarrays(data, self.na_values, self.verbose) + + df = DataFrame(data=data, columns=columns, index=index) + if self._has_complex_date_col and self.index_col is not None: + if not self._name_processed: + self.index_name = self._get_index_name(list(columns)) + self._name_processed = True + data = dict(((k, v) for k, v in df.iteritems())) + index = self._get_complex_date_index(data, col_names=columns, + parse_dates=False) + index = self._agg_index(index, False) + data = dict(((k, v.values) for k, v in data.iteritems())) + df = DataFrame(data=data, columns=columns, index=index) + + if self.squeeze and len(df.columns) == 1: + return df[df.columns[0]] + return df + + @property + def _has_complex_date_col(self): + return (isinstance(self.parse_dates, dict) or + (isinstance(self.parse_dates, list) and + len(self.parse_dates) > 0 and + isinstance(self.parse_dates[0], list))) + + def _get_simple_index(self, data): + def ix(col): + if not isinstance(col, basestring): + return col + raise ValueError('Index %s invalid' % col) + index = None + if np.isscalar(self.index_col): + index = data.pop(ix(self.index_col)) + else: # given a list of index + to_remove = [] + index = [] + for idx in self.index_col: + i = ix(idx) + to_remove.append(i) + index.append(data[idx]) + + # remove index items from content and columns, don't pop in + # loop + for i in reversed(sorted(to_remove)): + data.pop(i) + + return index + + def _get_complex_date_index(self, data, col_names=None, parse_dates=True): + def _get_name(icol): + if isinstance(icol, basestring): + return icol + + if col_names is None: + raise ValueError(('Must supply column order to use %s as ' + 'index') % str(icol)) + + for i, c in enumerate(col_names): + if i == icol: + return c + + index = None + if np.isscalar(self.index_col): + name = _get_name(self.index_col) + index = data.pop(name) + if col_names is not None: + col_names.remove(name) + else: # given a list of index + to_remove = [] + index = [] + for idx in self.index_col: + c = _get_name(idx) + to_remove.append(c) + index.append(data[c]) + + # remove index items from content and columns, don't pop in + # loop + for c in reversed(sorted(to_remove)): + data.pop(c) + if col_names is not None: + col_names.remove(c) + + return index + + def _agg_index(self, index, try_parse_dates=True): + if np.isscalar(self.index_col): + if try_parse_dates and self._should_parse_dates(self.index_col): + index = self._conv_date(index) + index, na_count = _convert_types(index, 
self.na_values) + index = Index(index, name=self.index_name) + if self.verbose and na_count: + print 'Found %d NA values in the index' % na_count + else: + arrays = [] + for i, arr in enumerate(index): + if (try_parse_dates and + self._should_parse_dates(self.index_col[i])): + arr = self._conv_date(arr) + arr, _ = _convert_types(arr, self.na_values) + arrays.append(arr) + index = MultiIndex.from_arrays(arrays, names=self.index_name) + return index + + def _should_parse_dates(self, i): + if isinstance(self.parse_dates, bool): + return self.parse_dates + else: + if np.isscalar(self.index_col): + name = self.index_name + else: + name = self.index_name[i] + + if np.isscalar(self.parse_dates): + return (i == self.parse_dates) or (name == self.parse_dates) + else: + return (i in self.parse_dates) or (name in self.parse_dates) + + def _conv_date(self, *date_cols): + if self.date_parser is None: + return lib.try_parse_dates(_concat_date_cols(date_cols), + dayfirst=self.dayfirst) + else: + try: + return self.date_parser(*date_cols) + except Exception, inst: + try: + return generic_parser(self.date_parser, *date_cols) + except Exception, inst: + return lib.try_parse_dates(_concat_date_cols(date_cols), + parser=self.date_parser, + dayfirst=self.dayfirst) + + def _process_date_conversion(self, data_dict): + new_cols = [] + new_data = {} + columns = self.columns + date_cols = set() + + if self.parse_dates is None or isinstance(self.parse_dates, bool): + return data_dict, columns + + if isinstance(self.parse_dates, list): + # list of column lists + for colspec in self.parse_dates: + if np.isscalar(colspec): + if isinstance(colspec, int) and colspec not in data_dict: + colspec = self.orig_columns[colspec] + if self._isindex(colspec): + continue + data_dict[colspec] = self._conv_date(data_dict[colspec]) + else: + new_name, col, old_names = _try_convert_dates( + self._conv_date, colspec, data_dict, self.orig_columns) + if new_name in data_dict: + raise ValueError('New date column already in dict %s' % + new_name) + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + elif isinstance(self.parse_dates, dict): + # dict of new name to column list + for new_name, colspec in self.parse_dates.iteritems(): + if new_name in data_dict: + raise ValueError('Date column %s already in dict' % + new_name) + + _, col, old_names = _try_convert_dates( + self._conv_date, colspec, data_dict, self.orig_columns) + + new_data[new_name] = col + new_cols.append(new_name) + date_cols.update(old_names) + + data_dict.update(new_data) + new_cols.extend(columns) + + if not self.keep_date_col: + for c in list(date_cols): + data_dict.pop(c) + new_cols.remove(c) + return data_dict, new_cols + + def _isindex(self, colspec): + return (colspec == self.index_col or + (isinstance(self.index_col, list) and + colspec in self.index_col) or + (colspec == self.index_name or + (isinstance(self.index_name, list) and + colspec in self.index_name))) + + def _get_lines(self, rows=None): + source = self.data + lines = self.buf + + # already fetched some number + if rows is not None: + rows -= len(self.buf) + + if isinstance(source, list): + if self.pos > len(source): + raise StopIteration + if rows is None: + lines.extend(source[self.pos:]) + self.pos = len(source) + else: + lines.extend(source[self.pos:self.pos+rows]) + self.pos += rows + else: + new_rows = [] + try: + if rows is not None: + for _ in xrange(rows): + new_rows.append(next(source)) + lines.extend(new_rows) + else: + rows = 0 + while True: + try: + 
new_rows.append(next(source)) + rows += 1 + except csv.Error, inst: + if 'newline inside string' in inst.message: + row_num = str(self.pos + rows) + msg = ('EOF inside string starting with line ' + + row_num) + raise Exception(msg) + raise + except StopIteration: + lines.extend(new_rows) + if len(lines) == 0: + raise + self.pos += len(new_rows) + + self.buf = [] + + if self.skip_footer: + lines = lines[:-self.skip_footer] + + lines = self._check_comments(lines) + return self._check_thousands(lines) + +def _convert_to_ndarrays(dct, na_values, verbose=False): + def _get_na_values(col): + if isinstance(na_values, dict): + if col in na_values: + return set(list(na_values[col])) + else: + return _NA_VALUES + else: + return na_values + + result = {} + for c, values in dct.iteritems(): + col_na_values = _get_na_values(c) + cvals, na_count = _convert_types(values, col_na_values) + result[c] = cvals + if verbose and na_count: + print 'Filled %d NA values in column %s' % (na_count, str(c)) + return result + +def _convert_types(values, na_values): + na_count = 0 + if issubclass(values.dtype.type, (np.number, np.bool_)): + mask = lib.ismember(values, na_values) + na_count = mask.sum() + if na_count > 0: + if com.is_integer_dtype(values): + values = values.astype(np.float64) + np.putmask(values, mask, np.nan) + return values, na_count + + try: + result = lib.maybe_convert_numeric(values, na_values, False) + except Exception: + na_count = lib.sanitize_objects(values, na_values, False) + result = values + + if result.dtype == np.object_: + result = lib.maybe_convert_bool(values) + + return result, na_count + +def _get_col_names(colspec, columns): + colset = set(columns) + colnames = [] + for c in colspec: + if c in colset: + colnames.append(str(c)) + elif isinstance(c, int): + colnames.append(str(columns[c])) + return colnames + +def _try_convert_dates(parser, colspec, data_dict, columns): + colspec = _get_col_names(colspec, columns) + new_name = '_'.join(colspec) + + to_parse = [data_dict[c] for c in colspec if c in data_dict] + try: + new_col = parser(*to_parse) + except DateConversionError: + new_col = parser(_concat_date_cols(to_parse)) + return new_name, new_col, colspec + +def _concat_date_cols(date_cols): + if len(date_cols) == 1: + return date_cols[0] + + # stripped = [map(str.strip, x) for x in date_cols] + return np.array([' '.join(x) for x in zip(*date_cols)], dtype=object) + + +class FixedWidthReader(object): + """ + A reader of fixed-width lines. + """ + def __init__(self, f, colspecs, filler, thousands=None): + self.f = f + self.colspecs = colspecs + self.filler = filler # Empty characters between fields. + self.thousands = thousands + + assert isinstance(colspecs, (tuple, list)) + for colspec in colspecs: + assert isinstance(colspec, (tuple, list)) + assert len(colspec) == 2 + assert isinstance(colspec[0], int) + assert isinstance(colspec[1], int) + + def next(self): + line = next(self.f) + # Note: 'colspecs' is a sequence of half-open intervals. + return [line[fromm:to].strip(self.filler or ' ') + for (fromm, to) in self.colspecs] + + # Iterator protocol in Python 3 uses __next__() + __next__ = next + + +class FixedWidthFieldParser(TextParser): + """ + Specialization that Converts fixed-width fields into DataFrames. + See TextParser for details. + """ + def __init__(self, f, **kwds): + # Support iterators, convert to a list. 
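+ # only 'colspecs' is consumed here; any remaining keywords are passed through to TextParser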
+ self.colspecs = list(kwds.pop('colspecs')) + + TextParser.__init__(self, f, **kwds) + + def _make_reader(self, f): + self.data = FixedWidthReader(f, self.colspecs, self.delimiter) + + +#---------------------------------------------------------------------- +# ExcelFile class + +_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n" + "You can install it via 'easy_install openpyxl' or " + "'pip install openpyxl'.\nAlternatively, you could save" + " the .xlsx file as a .xls file.\n") + + +class ExcelFile(object): + """ + Class for parsing tabular excel sheets into DataFrame objects. + Uses xlrd for parsing .xls files or openpyxl for .xlsx files. + See ExcelFile.parse for more documentation + + Parameters + ---------- + path : string or file-like object + Path to xls file + kind : {'xls', 'xlsx', None}, default None + """ + def __init__(self, path_or_buf): + self.use_xlsx = True + self.path_or_buf = path_or_buf + self.tmpfile = None + + if isinstance(path_or_buf, basestring): + if path_or_buf.endswith('.xls'): + self.use_xlsx = False + import xlrd + self.book = xlrd.open_workbook(path_or_buf) + else: + try: + from openpyxl.reader.excel import load_workbook + self.book = load_workbook(path_or_buf, use_iterators=True) + except ImportError: # pragma: no cover + raise ImportError(_openpyxl_msg) + else: + data = path_or_buf.read() + + try: + import xlrd + self.book = xlrd.open_workbook(file_contents=data) + self.use_xlsx = False + except Exception: + from openpyxl.reader.excel import load_workbook + buf = py3compat.BytesIO(data) + self.book = load_workbook(buf, use_iterators=True) + + def __repr__(self): + return object.__repr__(self) + + def parse(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None): + """ + Read Excel table into DataFrame + + Parameters + ---------- + sheetname : string + Name of Excel sheet + header : int, default 0 + Row to use for the column labels of the parsed DataFrame + skiprows : list-like + Row numbers to skip (0-indexed) + index_col : int, default None + Column to use as the row labels of the DataFrame. 
Pass None if + there is no such column + na_values : list-like, default None + List of additional strings to recognize as NA/NaN + + Returns + ------- + parsed : DataFrame + """ + choose = {True:self._parse_xlsx, + False:self._parse_xls} + return choose[self.use_xlsx](sheetname, header=header, + skiprows=skiprows, index_col=index_col, + parse_dates=parse_dates, + date_parser=date_parser, + na_values=na_values, + thousands=thousands, + chunksize=chunksize) + + def _parse_xlsx(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None): + sheet = self.book.get_sheet_by_name(name=sheetname) + data = [] + + # it brings a new method: iter_rows() + for row in sheet.iter_rows(): + data.append([cell.internal_value for cell in row]) + + if header is not None: + data[header] = _trim_excel_header(data[header]) + + parser = TextParser(data, header=header, index_col=index_col, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize) + + return parser.get_chunk() + + def _parse_xls(self, sheetname, header=0, skiprows=None, index_col=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None): + from datetime import MINYEAR, time, datetime + from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR + + datemode = self.book.datemode + sheet = self.book.sheet_by_name(sheetname) + + data = [] + for i in range(sheet.nrows): + row = [] + for value, typ in izip(sheet.row_values(i), sheet.row_types(i)): + if typ == XL_CELL_DATE: + dt = xldate_as_tuple(value, datemode) + # how to produce this first case? + if dt[0] < MINYEAR: # pragma: no cover + value = time(*dt[3:]) + else: + value = datetime(*dt) + if typ == XL_CELL_ERROR: + value = np.nan + row.append(value) + data.append(row) + + if header is not None: + data[header] = _trim_excel_header(data[header]) + + parser = TextParser(data, header=header, index_col=index_col, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + chunksize=chunksize) + + return parser.get_chunk() + + @property + def sheet_names(self): + if self.use_xlsx: + return self.book.get_sheet_names() + else: + return self.book.sheet_names() + + +def _trim_excel_header(row): + # trim header row so auto-index inference works + while len(row) > 0 and row[0] == '': + row = row[1:] + return row + +class ExcelWriter(object): + """ + Class for writing DataFrame objects into excel sheets, uses xlwt for xls, + openpyxl for xlsx. See DataFrame.to_excel for typical usage. 
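+ Rows are written one sheet at a time via writerow(); call save() to write the workbook to disk.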
+ + Parameters + ---------- + path : string + Path to xls file + """ + def __init__(self, path): + self.use_xlsx = True + if path.endswith('.xls'): + self.use_xlsx = False + import xlwt + self.book = xlwt.Workbook() + self.fm_datetime = xlwt.easyxf(num_format_str='YYYY-MM-DD HH:MM:SS') + self.fm_date = xlwt.easyxf(num_format_str='YYYY-MM-DD') + else: + from openpyxl.workbook import Workbook + self.book = Workbook(optimized_write = True) + self.path = path + self.sheets = {} + self.cur_sheet = None + + def save(self): + """ + Save workbook to disk + """ + self.book.save(self.path) + + def writerow(self, row, sheet_name=None): + """ + Write the given row into an Excel sheet + + Parameters + ---------- + row : list + Row of data to save to Excel sheet + sheet_name : string, default None + Name of Excel sheet; if None, use self.cur_sheet + """ + if sheet_name is None: + sheet_name = self.cur_sheet + if sheet_name is None: # pragma: no cover + raise Exception('Must pass explicit sheet_name or set ' + 'cur_sheet property') + if self.use_xlsx: + self._writerow_xlsx(row, sheet_name) + else: + self._writerow_xls(row, sheet_name) + + def _writerow_xls(self, row, sheet_name): + if sheet_name in self.sheets: + sheet, row_idx = self.sheets[sheet_name] + else: + sheet = self.book.add_sheet(sheet_name) + row_idx = 0 + sheetrow = sheet.row(row_idx) + for i, val in enumerate(row): + if isinstance(val, (datetime.datetime, datetime.date)): + if isinstance(val, datetime.datetime): + sheetrow.write(i,val, self.fm_datetime) + else: + sheetrow.write(i,val, self.fm_date) + elif isinstance(val, np.int64): + sheetrow.write(i,int(val)) + elif isinstance(val, np.bool8): + sheetrow.write(i,bool(val)) + else: + sheetrow.write(i,val) + row_idx += 1 + if row_idx == 1000: + sheet.flush_row_data() + self.sheets[sheet_name] = (sheet, row_idx) + + def _writerow_xlsx(self, row, sheet_name): + if sheet_name in self.sheets: + sheet, row_idx = self.sheets[sheet_name] + else: + sheet = self.book.create_sheet() + sheet.title = sheet_name + row_idx = 0 + + conv_row = [] + for val in row: + if isinstance(val, np.int64): + val = int(val) + elif isinstance(val, np.bool8): + val = bool(val) + conv_row.append(val) + sheet.append(conv_row) + row_idx += 1 + self.sheets[sheet_name] = (sheet, row_idx) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py new file mode 100644 index 00000000..4ec17655 --- /dev/null +++ b/pandas/io/pytables.py @@ -0,0 +1,1122 @@ +""" +High level interface to PyTables for reading and writing pandas data structures +to disk +""" + +# pylint: disable-msg=E1101,W0613,W0603 + +from datetime import datetime, date +import time + +import numpy as np +from pandas import ( + Series, TimeSeries, DataFrame, Panel, Index, MultiIndex, Int64Index +) +from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel +from pandas.sparse.array import BlockIndex, IntIndex +from pandas.tseries.api import PeriodIndex, DatetimeIndex +from pandas.core.common import adjoin +from pandas.core.algorithms import match, unique + +from pandas.core.categorical import Factor +from pandas.core.common import _asarray_tuplesafe +from pandas.core.internals import BlockManager, make_block +from pandas.core.reshape import block2d_to_block3d +import pandas.core.common as com + +import pandas.lib as lib +from contextlib import contextmanager + +# reading and writing the full object in one go +_TYPE_MAP = { + Series : 'series', + SparseSeries : 'sparse_series', + TimeSeries : 'series', + DataFrame : 'frame', + SparseDataFrame :
'sparse_frame', + Panel : 'wide', + SparsePanel : 'sparse_panel' +} + +_NAME_MAP = { + 'series' : 'Series', + 'time_series' : 'TimeSeries', + 'sparse_series' : 'SparseSeries', + 'frame' : 'DataFrame', + 'sparse_frame' : 'SparseDataFrame', + 'frame_table' : 'DataFrame (Table)', + 'wide' : 'Panel', + 'sparse_panel' : 'SparsePanel', + 'wide_table' : 'Panel (Table)', + 'long' : 'LongPanel', + # legacy h5 files + 'Series' : 'Series', + 'TimeSeries' : 'TimeSeries', + 'DataFrame' : 'DataFrame', + 'DataMatrix' : 'DataMatrix' +} + +# legacy handlers +_LEGACY_MAP = { + 'Series' : 'legacy_series', + 'TimeSeries' : 'legacy_series', + 'DataFrame' : 'legacy_frame', + 'DataMatrix' : 'legacy_frame', + 'WidePanel' : 'wide_table', +} + +# import PyTables lazily to reduce import time +_table_mod = None +def _tables(): + global _table_mod + if _table_mod is None: + import tables + _table_mod = tables + return _table_mod + +@contextmanager +def get_store(path, mode='a', complevel=None, complib=None, + fletcher32=False): + """ + Creates an HDFStore instance. This function can be used in a with statement. + + Parameters + ---------- + path : string + File path to HDF5 file + mode : {'a', 'w', 'r', 'r+'}, default 'a' + + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist. + complevel : int, 1-9, default 0 + If a complib is specified compression will be applied + where possible + complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None + If complevel is > 0 apply compression to objects written + in the store wherever possible + fletcher32 : bool, default False + If applying compression use the fletcher32 checksum + + Examples + -------- + >>> with get_store('test.h5') as store: + >>> store['foo'] = bar # write to HDF5 + >>> bar = store['foo'] # retrieve + """ + store = None + try: + store = HDFStore(path, mode=mode, complevel=complevel, + complib=complib, fletcher32=fletcher32) + yield store + finally: + if store is not None: + store.close() + +class HDFStore(object): + """ + dict-like IO interface for storing pandas objects in PyTables + format. + + DataFrame and Panel can be stored in Table format, which is slower to + read and write but can be searched and manipulated more like an SQL + table. See HDFStore.put for more information + + Parameters + ---------- + path : string + File path to HDF5 file + mode : {'a', 'w', 'r', 'r+'}, default 'a' + + ``'r'`` + Read-only; no data can be modified. + ``'w'`` + Write; a new file is created (an existing file with the same + name would be deleted). + ``'a'`` + Append; an existing file is opened for reading and writing, + and if the file does not exist it is created. + ``'r+'`` + It is similar to ``'a'``, but the file must already exist.
+ complevel : int, 1-9, default 0 + If a complib is specified compression will be applied + where possible + complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None + If complevel is > 0 apply compression to objects written + in the store wherever possible + fletcher32 : bool, default False + If applying compression use the fletcher32 checksum + + Examples + -------- + >>> store = HDFStore('test.h5') + >>> store['foo'] = bar # write to HDF5 + >>> bar = store['foo'] # retrieve + >>> store.close() + """ + _quiet = False + + def __init__(self, path, mode='a', complevel=None, complib=None, + fletcher32=False): + try: + import tables as _ + except ImportError: # pragma: no cover + raise Exception('HDFStore requires PyTables') + + self.path = path + self.mode = mode + self.handle = None + self.complevel = complevel + self.complib = complib + self.fletcher32 = fletcher32 + self.filters = None + self.open(mode=mode, warn=False) + + def __getitem__(self, key): + return self.get(key) + + def __setitem__(self, key, value): + self.put(key, value) + + def __contains__(self, key): + return hasattr(self.handle.root, key) + + def __len__(self): + return len(self.handle.root._v_children) + + def __repr__(self): + output = '%s\nFile path: %s\n' % (type(self), self.path) + + if len(self) > 0: + keys = [] + values = [] + for k, v in sorted(self.handle.root._v_children.iteritems()): + kind = v._v_attrs.pandas_type + + keys.append(str(k)) + values.append(_NAME_MAP[kind]) + + output += adjoin(5, keys, values) + else: + output += 'Empty' + + return output + + def keys(self): + """ + Return a (potentially unordered) list of the keys corresponding to the + objects stored in the HDFStore + """ + return self.handle.root._v_children.keys() + + def open(self, mode='a', warn=True): + """ + Open the file in the specified mode + + Parameters + ---------- + mode : {'a', 'w', 'r', 'r+'}, default 'a' + See HDFStore docstring or tables.openFile for info about modes + """ + self.mode = mode + if warn and mode == 'w': # pragma: no cover + while True: + response = raw_input("Re-opening as mode='w' will delete the " + "current file. Continue (y/n)?") + if response == 'y': + break + elif response == 'n': + return + if self.handle is not None and self.handle.isopen: + self.handle.close() + + if self.complib is not None: + if self.complevel is None: + self.complevel = 9 + self.filters = _tables().Filters(self.complevel, + self.complib, + fletcher32=self.fletcher32) + + try: + self.handle = _tables().openFile(self.path, self.mode) + except IOError, e: # pragma: no cover + if 'can not be written' in str(e): + print 'Opening %s in read-only mode' % self.path + self.handle = _tables().openFile(self.path, 'r') + else: + raise + + def close(self): + """ + Close the PyTables file handle + """ + self.handle.close() + + def flush(self): + """ + Force all buffered modifications to be written to disk + """ + self.handle.flush() + + def get(self, key): + """ + Retrieve pandas object stored in file + + Parameters + ---------- + key : object + + Returns + ------- + obj : type of object stored in file + """ + try: + group = getattr(self.handle.root, key) + return self._read_group(group) + except AttributeError: + raise + + def select(self, key, where=None): + """ + Retrieve pandas object stored in file, optionally based on where + criteria + + Parameters + ---------- + key : object + where : list, optional + + Must be a list of dict objects of the following forms. Selection can + be performed on the 'index' or 'column' fields. 
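+ Multiple dicts are combined with a logical AND.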
+ + Comparison op + {'field' : 'index', + 'op' : '>=', + 'value' : value} + + Match single value + {'field' : 'index', + 'value' : v1} + + Match a set of values + {'field' : 'index', + 'value' : [v1, v2, v3]} + + """ + group = getattr(self.handle.root, key, None) + if group is not None: + if 'table' not in group._v_attrs.pandas_type: + raise Exception('can only select on objects written as tables') + return self._read_group(group, where) + + def put(self, key, value, table=False, append=False, + compression=None): + """ + Store object in HDFStore + + Parameters + ---------- + key : object + value : {Series, DataFrame, Panel} + table : boolean, default False + Write as a PyTables Table structure which may perform worse but + allows more flexible operations like searching / selecting subsets of + the data + append : boolean, default False + For table data structures, append the input data to the existing + table + compression : {None, 'blosc', 'lzo', 'zlib'}, default None + Use a compression algorithm to compress the data + If None, the compression settings specified in the constructor will + be used. + """ + self._write_to_group(key, value, table=table, append=append, + comp=compression) + + def _get_handler(self, op, kind): + return getattr(self, '_%s_%s' % (op, kind)) + + def remove(self, key, where=None): + """ + Remove a pandas object from the store, or delete rows from a Table node + matching the where condition + + Parameters + ---------- + key : string + Node to remove or delete rows from + where : list + For a Table node, delete the specified rows. See HDFStore.select for + more information + """ + if where is None: + self.handle.removeNode(self.handle.root, key, recursive=True) + else: + group = getattr(self.handle.root, key, None) + if group is not None: + self._delete_from_table(group, where) + + def append(self, key, value): + """ + Append to Table in file. Node must already exist and be Table + format.
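+ Equivalent to put(key, value, table=True, append=True).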
+ + Parameters + ---------- + key : object + value : {Series, DataFrame, Panel} + + Notes + ----- + Does *not* check if data being appended overlaps with existing + data in the table, so be careful + """ + self._write_to_group(key, value, table=True, append=True) + + def _write_to_group(self, key, value, table=False, append=False, + comp=None): + root = self.handle.root + if key not in root._v_children: + group = self.handle.createGroup(root, key) + else: + group = getattr(root, key) + + kind = _TYPE_MAP[type(value)] + if table or (append and _is_table_type(group)): + kind = '%s_table' % kind + handler = self._get_handler(op='write', kind=kind) + wrapper = lambda value: handler(group, value, append=append, + comp=comp) + else: + if append: + raise ValueError('Can only append to Tables') + if comp: + raise ValueError('Compression only supported on Tables') + + handler = self._get_handler(op='write', kind=kind) + wrapper = lambda value: handler(group, value) + + wrapper(value) + group._v_attrs.pandas_type = kind + + def _write_series(self, group, series): + self._write_index(group, 'index', series.index) + self._write_array(group, 'values', series.values) + group._v_attrs.name = series.name + + def _write_sparse_series(self, group, series): + self._write_index(group, 'index', series.index) + self._write_index(group, 'sp_index', series.sp_index) + self._write_array(group, 'sp_values', series.sp_values) + group._v_attrs.name = series.name + group._v_attrs.fill_value = series.fill_value + group._v_attrs.kind = series.kind + + def _read_sparse_series(self, group, where=None): + index = self._read_index(group, 'index') + sp_values = _read_array(group, 'sp_values') + sp_index = self._read_index(group, 'sp_index') + name = getattr(group._v_attrs, 'name', None) + fill_value = getattr(group._v_attrs, 'fill_value', None) + kind = getattr(group._v_attrs, 'kind', 'block') + return SparseSeries(sp_values, index=index, sparse_index=sp_index, + kind=kind, fill_value=fill_value, + name=name) + + def _write_sparse_frame(self, group, sdf): + for name, ss in sdf.iteritems(): + key = 'sparse_series_%s' % name + if key not in group._v_children: + node = self.handle.createGroup(group, key) + else: + node = getattr(group, key) + self._write_sparse_series(node, ss) + setattr(group._v_attrs, 'default_fill_value', + sdf.default_fill_value) + setattr(group._v_attrs, 'default_kind', + sdf.default_kind) + self._write_index(group, 'columns', sdf.columns) + + def _read_sparse_frame(self, group, where=None): + columns = self._read_index(group, 'columns') + sdict = {} + for c in columns: + key = 'sparse_series_%s' % c + node = getattr(group, key) + sdict[c] = self._read_sparse_series(node) + default_kind = getattr(group._v_attrs, 'default_kind') + default_fill_value = getattr(group._v_attrs, 'default_fill_value') + return SparseDataFrame(sdict, columns=columns, + default_kind=default_kind, + default_fill_value=default_fill_value) + + def _write_sparse_panel(self, group, swide): + setattr(group._v_attrs, 'default_fill_value', swide.default_fill_value) + setattr(group._v_attrs, 'default_kind', swide.default_kind) + self._write_index(group, 'items', swide.items) + + for name, sdf in swide.iteritems(): + key = 'sparse_frame_%s' % name + if key not in group._v_children: + node = self.handle.createGroup(group, key) + else: + node = getattr(group, key) + self._write_sparse_frame(node, sdf) + + def _read_sparse_panel(self, group, where=None): + default_fill_value = getattr(group._v_attrs, 'default_fill_value') + default_kind = 
getattr(group._v_attrs, 'default_kind') + items = self._read_index(group, 'items') + + sdict = {} + for name in items: + key = 'sparse_frame_%s' % name + node = getattr(group, key) + sdict[name] = self._read_sparse_frame(node) + return SparsePanel(sdict, items=items, default_kind=default_kind, + default_fill_value=default_fill_value) + + def _write_frame(self, group, df): + self._write_block_manager(group, df._data) + + def _read_frame(self, group, where=None): + return DataFrame(self._read_block_manager(group)) + + def _write_block_manager(self, group, data): + if not data.is_consolidated(): + data = data.consolidate() + + group._v_attrs.ndim = data.ndim + for i, ax in enumerate(data.axes): + self._write_index(group, 'axis%d' % i, ax) + + # Supporting mixed-type DataFrame objects...nontrivial + nblocks = len(data.blocks) + group._v_attrs.nblocks = nblocks + for i in range(nblocks): + blk = data.blocks[i] + self._write_index(group, 'block%d_items' % i, blk.items) + self._write_array(group, 'block%d_values' % i, blk.values) + + def _read_block_manager(self, group): + ndim = group._v_attrs.ndim + + axes = [] + for i in xrange(ndim): + ax = self._read_index(group, 'axis%d' % i) + axes.append(ax) + + items = axes[0] + blocks = [] + for i in range(group._v_attrs.nblocks): + blk_items = self._read_index(group, 'block%d_items' % i) + values = _read_array(group, 'block%d_values' % i) + blk = make_block(values, blk_items, items) + blocks.append(blk) + + return BlockManager(blocks, axes) + + def _write_frame_table(self, group, df, append=False, comp=None): + mat = df.values + values = mat.reshape((1,) + mat.shape) + + if df._is_mixed_type: + raise Exception('Cannot currently store mixed-type DataFrame ' + 'objects in Table format') + + self._write_table(group, items=['value'], + index=df.index, columns=df.columns, + values=values, append=append, compression=comp) + + def _write_wide(self, group, panel): + panel._consolidate_inplace() + self._write_block_manager(group, panel._data) + + def _read_wide(self, group, where=None): + return Panel(self._read_block_manager(group)) + + def _write_wide_table(self, group, panel, append=False, comp=None): + self._write_table(group, items=panel.items, index=panel.major_axis, + columns=panel.minor_axis, values=panel.values, + append=append, compression=comp) + + def _read_wide_table(self, group, where=None): + return self._read_panel_table(group, where) + + def _write_index(self, group, key, index): + if isinstance(index, MultiIndex): + if len(index) == 0: + raise ValueError('Can not write empty structure, ' + 'axis length was 0') + + setattr(group._v_attrs, '%s_variety' % key, 'multi') + self._write_multi_index(group, key, index) + elif isinstance(index, BlockIndex): + setattr(group._v_attrs, '%s_variety' % key, 'block') + self._write_block_index(group, key, index) + elif isinstance(index, IntIndex): + setattr(group._v_attrs, '%s_variety' % key, 'sparseint') + self._write_sparse_intindex(group, key, index) + else: + if len(index) == 0: + raise ValueError('Can not write empty structure, ' + 'axis length was 0') + + setattr(group._v_attrs, '%s_variety' % key, 'regular') + converted, kind, _ = _convert_index(index) + self._write_array(group, key, converted) + node = getattr(group, key) + node._v_attrs.kind = kind + node._v_attrs.name = index.name + + if isinstance(index, (DatetimeIndex, PeriodIndex)): + node._v_attrs.index_class = _class_to_alias(type(index)) + + if hasattr(index, 'freq'): + node._v_attrs.freq = index.freq + + if hasattr(index, 'tz') and index.tz is 
not None: + node._v_attrs.tz = index.tz.zone + + def _read_index(self, group, key): + variety = getattr(group._v_attrs, '%s_variety' % key) + + if variety == 'multi': + return self._read_multi_index(group, key) + elif variety == 'block': + return self._read_block_index(group, key) + elif variety == 'sparseint': + return self._read_sparse_intindex(group, key) + elif variety == 'regular': + _, index = self._read_index_node(getattr(group, key)) + return index + else: # pragma: no cover + raise Exception('unrecognized index variety: %s' % variety) + + def _write_block_index(self, group, key, index): + self._write_array(group, '%s_blocs' % key, index.blocs) + self._write_array(group, '%s_blengths' % key, index.blengths) + setattr(group._v_attrs, '%s_length' % key, index.length) + + def _read_block_index(self, group, key): + length = getattr(group._v_attrs, '%s_length' % key) + blocs = _read_array(group, '%s_blocs' % key) + blengths = _read_array(group, '%s_blengths' % key) + return BlockIndex(length, blocs, blengths) + + def _write_sparse_intindex(self, group, key, index): + self._write_array(group, '%s_indices' % key, index.indices) + setattr(group._v_attrs, '%s_length' % key, index.length) + + def _read_sparse_intindex(self, group, key): + length = getattr(group._v_attrs, '%s_length' % key) + indices = _read_array(group, '%s_indices' % key) + return IntIndex(length, indices) + + def _write_multi_index(self, group, key, index): + setattr(group._v_attrs, '%s_nlevels' % key, index.nlevels) + + for i, (lev, lab, name) in enumerate(zip(index.levels, + index.labels, + index.names)): + # write the level + conv_level, kind, _ = _convert_index(lev) + level_key = '%s_level%d' % (key, i) + self._write_array(group, level_key, conv_level) + node = getattr(group, level_key) + node._v_attrs.kind = kind + node._v_attrs.name = name + + # write the name + setattr(node._v_attrs, '%s_name%d' % (key, i), name) + + # write the labels + label_key = '%s_label%d' % (key, i) + self._write_array(group, label_key, lab) + + def _read_multi_index(self, group, key): + nlevels = getattr(group._v_attrs, '%s_nlevels' % key) + + levels = [] + labels = [] + names = [] + for i in range(nlevels): + level_key = '%s_level%d' % (key, i) + name, lev = self._read_index_node(getattr(group, level_key)) + levels.append(lev) + names.append(name) + + label_key = '%s_label%d' % (key, i) + lab = getattr(group, label_key)[:] + labels.append(lab) + + return MultiIndex(levels=levels, labels=labels, names=names) + + def _read_index_node(self, node): + data = node[:] + kind = node._v_attrs.kind + name = None + + if 'name' in node._v_attrs: + name = node._v_attrs.name + + index_class = _alias_to_class(getattr(node._v_attrs, 'index_class', '')) + factory = _get_index_factory(index_class) + + kwargs = {} + if 'freq' in node._v_attrs: + kwargs['freq'] = node._v_attrs['freq'] + + if 'tz' in node._v_attrs: + kwargs['tz'] = node._v_attrs['tz'] + + if kind in ('date', 'datetime'): + index = factory(_unconvert_index(data, kind), dtype=object, + **kwargs) + else: + index = factory(_unconvert_index(data, kind), **kwargs) + + index.name = name + + return name, index + + def _write_array(self, group, key, value): + if key in group: + self.handle.removeNode(group, key) + + if self.filters is not None: + atom = None + try: + # get the atom for this datatype + atom = _tables().Atom.from_dtype(value.dtype) + except ValueError: + pass + + if atom is not None: + # create an empty chunked array and fill it from value + ca = self.handle.createCArray(group, key, atom, 
+ value.shape, + filters=self.filters) + ca[:] = value + return + + if value.dtype.type == np.object_: + vlarr = self.handle.createVLArray(group, key, + _tables().ObjectAtom()) + vlarr.append(value) + elif value.dtype.type == np.datetime64: + self.handle.createArray(group, key, value.view('i8')) + getattr(group, key)._v_attrs.value_type = 'datetime64' + else: + self.handle.createArray(group, key, value) + + def _write_table(self, group, items=None, index=None, columns=None, + values=None, append=False, compression=None): + """ need to check for conform to the existing table: + e.g. columns should match """ + # create dict of types + index_converted, index_kind, index_t = _convert_index(index) + columns_converted, cols_kind, col_t = _convert_index(columns) + + # create the table if it doesn't exist (or get it if it does) + if not append: + if 'table' in group: + self.handle.removeNode(group, 'table') + + if 'table' not in group: + # create the table + desc = {'index' : index_t, + 'column' : col_t, + 'values' : _tables().FloatCol(shape=(len(values)))} + + options = {'name' : 'table', + 'description' : desc} + + if compression: + complevel = self.complevel + if complevel is None: + complevel = 9 + filters = _tables().Filters(complevel=complevel, + complib=compression, + fletcher32=self.fletcher32) + options['filters'] = filters + elif self.filters is not None: + options['filters'] = self.filters + + table = self.handle.createTable(group, **options) + else: + # the table must already exist + table = getattr(group, 'table', None) + + # add kinds + table._v_attrs.index_kind = index_kind + table._v_attrs.columns_kind = cols_kind + if append: + existing_fields = getattr(table._v_attrs,'fields',None) + if (existing_fields is not None and + existing_fields != list(items)): + raise Exception("appended items do not match existing items" + " in table!") + # this depends on creation order of the table + table._v_attrs.fields = list(items) + + # add the rows + try: + for i, index in enumerate(index_converted): + for c, col in enumerate(columns_converted): + v = values[:, i, c] + + # don't store the row if all values are np.nan + if np.isnan(v).all(): + continue + + row = table.row + row['index'] = index + row['column'] = col + + # create the values array + row['values'] = v + row.append() + self.handle.flush() + except (ValueError), detail: # pragma: no cover + print "value_error in _write_table -> %s" % str(detail) + try: + self.handle.flush() + except Exception: + pass + raise + + def _read_group(self, group, where=None): + kind = group._v_attrs.pandas_type + kind = _LEGACY_MAP.get(kind, kind) + handler = self._get_handler(op='read', kind=kind) + return handler(group, where) + + def _read_series(self, group, where=None): + index = self._read_index(group, 'index') + values = _read_array(group, 'values') + name = getattr(group._v_attrs, 'name', None) + return Series(values, index=index, name=name) + + def _read_legacy_series(self, group, where=None): + index = self._read_index_legacy(group, 'index') + values = _read_array(group, 'values') + return Series(values, index=index) + + def _read_legacy_frame(self, group, where=None): + index = self._read_index_legacy(group, 'index') + columns = self._read_index_legacy(group, 'columns') + values = _read_array(group, 'values') + return DataFrame(values, index=index, columns=columns) + + def _read_index_legacy(self, group, key): + node = getattr(group, key) + data = node[:] + kind = node._v_attrs.kind + + return _unconvert_index_legacy(data, kind) + + def 
_read_frame_table(self, group, where=None): + return self._read_panel_table(group, where)['value'] + + def _read_panel_table(self, group, where=None): + table = getattr(group, 'table') + fields = table._v_attrs.fields + + # create the selection + sel = Selection(table, where, table._v_attrs.index_kind) + sel.select() + fields = table._v_attrs.fields + + columns = _maybe_convert(sel.values['column'], + table._v_attrs.columns_kind) + index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind) + values = sel.values['values'] + + major = Factor.from_array(index) + minor = Factor.from_array(columns) + + J, K = len(major.levels), len(minor.levels) + key = major.labels * K + minor.labels + + if len(unique(key)) == len(key): + sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K) + sorter = com._ensure_platform_int(sorter) + + # the data need to be sorted + sorted_values = values.take(sorter, axis=0) + major_labels = major.labels.take(sorter) + minor_labels = minor.labels.take(sorter) + + block = block2d_to_block3d(sorted_values, fields, (J, K), + major_labels, minor_labels) + + mgr = BlockManager([block], [block.ref_items, + major.levels, minor.levels]) + wp = Panel(mgr) + else: + if not self._quiet: # pragma: no cover + print ('Duplicate entries in table, taking most recently ' + 'appended') + + # reconstruct + long_index = MultiIndex.from_arrays([index, columns]) + lp = DataFrame(values, index=long_index, columns=fields) + + # need a better algorithm + tuple_index = long_index._tuple_index + + unique_tuples = lib.fast_unique(tuple_index) + unique_tuples = _asarray_tuplesafe(unique_tuples) + + indexer = match(unique_tuples, tuple_index) + indexer = com._ensure_platform_int(indexer) + + new_index = long_index.take(indexer) + new_values = lp.values.take(indexer, axis=0) + + lp = DataFrame(new_values, index=new_index, columns=lp.columns) + wp = lp.to_panel() + + if sel.column_filter: + new_minor = sorted(set(wp.minor_axis) & sel.column_filter) + wp = wp.reindex(minor=new_minor) + return wp + + def _delete_from_table(self, group, where = None): + table = getattr(group, 'table') + + # create the selection + s = Selection(table, where, table._v_attrs.index_kind) + s.select_coords() + + # delete the rows in reverse order + l = list(s.values) + l.reverse() + for c in l: + table.removeRows(c) + self.handle.flush() + return len(s.values) + +def _convert_index(index): + if isinstance(index, DatetimeIndex): + converted = index.asi8 + return converted, 'datetime64', _tables().Int64Col() + elif isinstance(index, (Int64Index, PeriodIndex)): + atom = _tables().Int64Col() + return index.values, 'integer', atom + + inferred_type = lib.infer_dtype(index) + + values = np.asarray(index) + + if inferred_type == 'datetime64': + converted = values.view('i8') + return converted, 'datetime64', _tables().Int64Col() + elif inferred_type == 'datetime': + converted = np.array([(time.mktime(v.timetuple()) + + v.microsecond / 1E6) for v in values], + dtype=np.float64) + return converted, 'datetime', _tables().Time64Col() + elif inferred_type == 'date': + converted = np.array([time.mktime(v.timetuple()) for v in values], + dtype=np.int32) + return converted, 'date', _tables().Time32Col() + elif inferred_type =='string': + converted = np.array(list(values), dtype=np.str_) + itemsize = converted.dtype.itemsize + return converted, 'string', _tables().StringCol(itemsize) + elif inferred_type == 'unicode': + atom = _tables().ObjectAtom() + return np.asarray(values, dtype='O'), 'object', atom + elif 
inferred_type == 'integer': + # take a guess for now, hope the values fit + atom = _tables().Int64Col() + return np.asarray(values, dtype=np.int64), 'integer', atom + elif inferred_type == 'floating': + atom = _tables().Float64Col() + return np.asarray(values, dtype=np.float64), 'float', atom + else: # pragma: no cover + atom = _tables().ObjectAtom() + return np.asarray(values, dtype='O'), 'object', atom + +def _read_array(group, key): + import tables + node = getattr(group, key) + data = node[:] + + if isinstance(node, tables.VLArray): + return data[0] + else: + dtype = getattr(node._v_attrs, 'value_type', None) + if dtype == 'datetime64': + return np.array(data, dtype='M8[ns]') + return data + +def _unconvert_index(data, kind): + if kind == 'datetime64': + index = DatetimeIndex(data) + elif kind == 'datetime': + index = np.array([datetime.fromtimestamp(v) for v in data], + dtype=object) + elif kind == 'date': + index = np.array([date.fromtimestamp(v) for v in data], dtype=object) + elif kind in ('string', 'integer', 'float'): + index = np.array(data) + elif kind == 'object': + index = np.array(data[0]) + else: # pragma: no cover + raise ValueError('unrecognized index type %s' % kind) + return index + +def _unconvert_index_legacy(data, kind, legacy=False): + if kind == 'datetime': + index = lib.time64_to_datetime(data) + elif kind in ('string', 'integer'): + index = np.array(data, dtype=object) + else: # pragma: no cover + raise ValueError('unrecognized index type %s' % kind) + return index + +def _maybe_convert(values, val_kind): + if _need_convert(val_kind): + conv = _get_converter(val_kind) + # conv = np.frompyfunc(conv, 1, 1) + values = conv(values) + return values + +def _get_converter(kind): + if kind == 'datetime64': + return lambda x: np.array(x, dtype='M8[ns]') + if kind == 'datetime': + return lib.convert_timestamps + else: # pragma: no cover + raise ValueError('invalid kind %s' % kind) + +def _need_convert(kind): + if kind in ('datetime', 'datetime64'): + return True + return False + +def _is_table_type(group): + try: + return 'table' in group._v_attrs.pandas_type + except AttributeError: + # new node, e.g. + return False + +_index_type_map = {DatetimeIndex : 'datetime', + PeriodIndex : 'period'} + +_reverse_index_map = {} +for k, v in _index_type_map.iteritems(): + _reverse_index_map[v] = k + +def _class_to_alias(cls): + return _index_type_map.get(cls, '') + +def _alias_to_class(alias): + if isinstance(alias, type): # pragma: no cover + return alias # compat: for a short period of time master stored types + return _reverse_index_map.get(alias, Index) + +class Selection(object): + """ + Carries out a selection operation on a tables.Table object. 
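+ Conditions derived from 'where' are joined with a logical AND into a single PyTables query expression.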
+ + Parameters + ---------- + table : tables.Table + where : list of dicts of the following form + + Comparison op + {'field' : 'index', + 'op' : '>=', + 'value' : value} + + Match single value + {'field' : 'index', + 'value' : v1} + + Match a set of values + {'field' : 'index', + 'value' : [v1, v2, v3]} + """ + def __init__(self, table, where=None, index_kind=None): + self.table = table + self.where = where + self.index_kind = index_kind + self.column_filter = None + self.the_condition = None + self.conditions = [] + self.values = None + if where: + self.generate(where) + + def generate(self, where): + # and condictions + for c in where: + op = c.get('op',None) + value = c['value'] + field = c['field'] + + if field == 'index' and self.index_kind == 'datetime64': + val = lib.Timestamp(value).value + self.conditions.append('(%s %s %s)' % (field,op,val)) + elif field == 'index' and isinstance(value, datetime): + value = time.mktime(value.timetuple()) + self.conditions.append('(%s %s %s)' % (field,op,value)) + else: + self.generate_multiple_conditions(op,value,field) + + if len(self.conditions): + self.the_condition = '(' + ' & '.join(self.conditions) + ')' + + def generate_multiple_conditions(self, op, value, field): + + if op and op == 'in' or isinstance(value, (list, np.ndarray)): + if len(value) <= 61: + l = '(' + ' | '.join([ "(%s == '%s')" % (field,v) + for v in value ]) + ')' + self.conditions.append(l) + else: + self.column_filter = set(value) + else: + if op is None: + op = '==' + self.conditions.append('(%s %s "%s")' % (field,op,value)) + + def select(self): + """ + generate the selection + """ + if self.the_condition: + self.values = self.table.readWhere(self.the_condition) + + else: + self.values = self.table.read() + + def select_coords(self): + """ + generate the selection + """ + self.values = self.table.getWhereList(self.the_condition) + +def _get_index_factory(klass): + if klass == DatetimeIndex: + def f(values, freq=None, tz=None): + return DatetimeIndex._simple_new(values, None, freq=freq, + tz=tz) + return f + return klass + diff --git a/pandas/io/sql.py b/pandas/io/sql.py new file mode 100644 index 00000000..01942e26 --- /dev/null +++ b/pandas/io/sql.py @@ -0,0 +1,229 @@ +""" +Collection of query wrappers / abstractions to both facilitate data +retrieval and to reduce dependency on DB-specific API. +""" +from datetime import datetime + +import numpy as np +import traceback + +from pandas.core.datetools import format as date_format +from pandas.core.api import DataFrame, isnull + +#------------------------------------------------------------------------------- +# Helper execution function + +def execute(sql, con, retry=True, cur=None, params=None): + """ + Execute the given SQL query using the provided connection object. + + Parameters + ---------- + sql: string + Query to be executed + + Returns + ------- + Cursor object + """ + try: + if cur is None: + cur = con.cursor() + + if params is None: + cur.execute(sql) + else: + cur.execute(sql, params) + return cur + except Exception: + try: + con.rollback() + except Exception: # pragma: no cover + pass + + print 'Error on sql %s' % sql + raise + +def _safe_fetch(cur): + try: + result = cur.fetchall() + if not isinstance(result, list): + result = list(result) + return result + except Exception, e: # pragma: no cover + excName = e.__class__.__name__ + if excName == 'OperationalError': + return [] + +def tquery(sql, con=None, cur=None, retry=True): + """ + Returns list of tuples corresponding to each row in given sql + query. 
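+ Rows are fetched with cursor.fetchall() after the query is executed.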
+ + If only one column selected, then plain list is returned. + + Parameters + ---------- + sql: string + SQL query to be executed + con: SQLConnection or DB API 2.0-compliant connection + cur: DB API 2.0 cursor + + Provide a specific connection or a specific cursor if you are executing a + lot of sequential statements and want to commit outside. + """ + cur = execute(sql, con, cur=cur) + result = _safe_fetch(cur) + + if con is not None: + try: + cur.close() + con.commit() + except Exception, e: + excName = e.__class__.__name__ + if excName == 'OperationalError': # pragma: no cover + print 'Failed to commit, may need to restart interpreter' + else: + raise + + traceback.print_exc() + if retry: + return tquery(sql, con=con, retry=False) + + if result and len(result[0]) == 1: + # python 3 compat + result = list(list(zip(*result))[0]) + elif result is None: # pragma: no cover + result = [] + + return result + +def uquery(sql, con=None, cur=None, retry=True, params=()): + """ + Does the same thing as tquery, but instead of returning results, it + returns the number of rows affected. Good for update queries. + """ + cur = execute(sql, con, cur=cur, retry=retry, params=params) + + result = cur.rowcount + try: + con.commit() + except Exception, e: + excName = e.__class__.__name__ + if excName != 'OperationalError': + raise + + traceback.print_exc() + if retry: + print 'Looks like your connection failed, reconnecting...' + return uquery(sql, con, retry=False) + return result + +def read_frame(sql, con, index_col=None, coerce_float=True): + """ + Returns a DataFrame corresponding to the result set of the query + string. + + Optionally provide an index_col parameter to use one of the + columns as the index. Otherwise will be 0 to len(results) - 1. + + Parameters + ---------- + sql: string + SQL query to be executed + con: DB connection object, optional + index_col: string, optional + column name to use for the returned DataFrame object. + """ + cur = execute(sql, con) + rows = _safe_fetch(cur) + columns = [col_desc[0] for col_desc in cur.description] + + cur.close() + con.commit() + + result = DataFrame.from_records(rows, columns=columns, + coerce_float=coerce_float) + + if index_col is not None: + result = result.set_index(index_col) + + return result + +frame_query = read_frame + +def write_frame(frame, name=None, con=None, flavor='sqlite'): + """ + Write records stored in a DataFrame to SQLite. 
The index will currently be + dropped + """ + if flavor == 'sqlite': + schema = get_sqlite_schema(frame, name) + else: + raise NotImplementedError + + con.execute(schema) + + wildcards = ','.join(['?'] * len(frame.columns)) + insert_sql = 'INSERT INTO %s VALUES (%s)' % (name, wildcards) + data = [tuple(x) for x in frame.values] + con.executemany(insert_sql, data) + +def get_sqlite_schema(frame, name): + template = """ +CREATE TABLE %(name)s ( + %(columns)s +);""" + + column_types = [] + + dtypes = frame.dtypes + for k in dtypes.index: + dt = dtypes[k] + + if issubclass(dt.type, (np.integer, np.bool_)): + sqltype = 'INTEGER' + elif issubclass(dt.type, np.floating): + sqltype = 'REAL' + else: + sqltype = 'TEXT' + + column_types.append((k, sqltype)) + + columns = ',\n '.join('%s %s' % x for x in column_types) + + return template % {'name' : name, 'columns' : columns} + + + + +#------------------------------------------------------------------------------- +# Query formatting + +_formatters = { + datetime : lambda dt: "'%s'" % date_format(dt), + str : lambda x: "'%s'" % x, + np.str_ : lambda x: "'%s'" % x, + unicode : lambda x: "'%s'" % x, + float : lambda x: "%.8f" % x, + int : lambda x: "%s" % x, + type(None) : lambda x: "NULL", + np.float64 : lambda x: "%.10f" % x, + bool : lambda x: "'%s'" % x, +} + +def format_query(sql, *args): + """ + + """ + processed_args = [] + for arg in args: + if isinstance(arg, float) and isnull(arg): + arg = None + + formatter = _formatters[type(arg)] + processed_args.append(formatter(arg)) + + return sql % tuple(processed_args) + + diff --git a/pandas/io/tests/__init__.py b/pandas/io/tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/pandas/io/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/pandas/io/tests/legacy.h5 b/pandas/io/tests/legacy.h5 new file mode 100644 index 0000000000000000000000000000000000000000..38b822dd169945b5f5022a219cc784a4c9951320 GIT binary patch literal 14928 zcmeHN2|Sfq8^0(aNlls-)tzdjME0fVxU%M2vRxdOIp-4O%o+G)mJFVB)2b> zR!SU1C>2wKk%ViP>|4lp-|f3TrP24zeA85~pZ9;3=RNN^&w0-Cod0=msJ^!D6rnjn zB;*nhAWa~NqDkU5j9;cJu@WLr)H~n_9vt`K$l}EkJS2V+Qa1}P7scBr^I-)Ic)5j{ zjt+?xV&MLp&<5Sx#dQlO6X@v4 zboJ-jvnC<%foN`QqRqpJUbwlE1SY({0m44XV(*tD`rjx1emOE;okBGHe({gCmWa=V zGq5h2;yva1qmRSi@o+pO>H~2{Nc3A2alo7lu`T4{<5&bDf)pd0oQZgzmzNiPQh?(x zkpv%)=wuFXuH|@(SZ+FQQAnYL8L#nZJ&g3@h=~&qQ_RdLHk>?H{%t>wm?$AHFQdT8 zjg%jsp3#RmH_ZNd|84&%#6lnhMj>IQ`*3}Ys{U-VOdXnvA;WUrCc2Wbs;$4VqShJ} zIjX5?fRhr{z{Fpkrf2DE%kXtIwAHtuu3=dC2T-?K(3q;$dQMs@6cq;3NZU8qc8i|5 zk-s5b(NLKYpcD|`tK{WG*L77=Hc{T>>`Hf)w=&Z6u-37qF^z1Tf&y*b6cv2dIT^?) zY+7UKq2;ckX1>nD*2%!%YK@Y=v5m67vBD->McqJy&1tzUG$3 zI*yK;186Q@pNf@cND*qJh?G2|6krD7KXgv^Qm=)}oqlUGwV+&J~O5cICj* zZ3za2T@7%CnqcS7NhPrKzVO7_;w)%hQd0SAPAL#ns~O-CZ-)!geN}8D?f{aDII|pC z85vqzQ52`|L~%>R9u%M62t~0$G7?2?lNc0ZnLndA7|lYFmUR(DPo*>z^VVmfxOjIi zin~|^D3(7eLa~ifg(B6x4n@|%W)vsfXhYHASvQIS#uJ|-{VNO*LUDQYR227R&OouI zem07Iwo6g8_)Z4Jqyz;Ng&(P+=-#D)Am6H*nxpxH@cG52z&*pcu*kXnY4D6bSjueJ zb|fepyf|qhq?%F&9d15}kIHxfLlQ#r{F|Zy%cxH*n%w{rCy*q>O5ecU*L-{u=)VDx z74z!y5*lE)Mc>BRU)O-C8fUdv);)s9&UA0DkS>FEGn4E#oahG^=1(r`yD$Xp?b3gU zF{^?_#mfd(@TCF%FvI%P^-(}Tl=n>X+-D#(M_R_ZI~59xSlOOl(+LNbP{rBOU2x@G z+bP~bselrebh>&OoVZ&L%t5fqn zyZNPPt|b3$z!~kUW{Ol4H5i_JP$EU5=nCbhFrcP zjJCsY^mcLd-xv_ga76`>u!K+vB0g$61~}^aIeKBWNcp?@3Rwngf4qF9bOk0)?DMeP zm;2|U0b~t~94imb_jz!`n~zt1sNHkrksYT1X)}lLAWb25J#wGuAK*v#c0=r05Vdtx z{JmrKgAd=YP9phZ(;jV#XpOmW^R zOp--MjPM3zKHlYTM$U`5s~;WXrSJALm0x2D9Iu}_F&mR7_T9Mpzrbgrfw9g-_ThF! 
zsNs)37a@3YtT^@t8vY{gy86(#kI9eZy(8oPe|vt(Lz;}mdffjJ{jQvYb(qWNMNv8A zT$1pAh{V`_Z}}9gg};~I8_nnU{mur3@GG>r*l54pF%66rN4fStzPQ4be>=G5fA2AhT7S)Q2Cm-{EDfzffDjE~R(wvRLg{NYV{=NJk7T1x{Vwdb9JTV4fdmIwU zfjou)UQGC(6N}$3_o>wWb#Yw?wNn}%SVs2)r=9=t-3bvLdw)8J+cEJZHbb8pFX#HD zxA#vYc{lew`c^5s8AQHx5|XJbfk|X(E&H4% zaQ^ZJT`QLs&e?tU9owU?p|l+TuR8(4zu83%;mnN(w*>z zaonK#&H->cQAuqmGXs{{x5*7|dkJ@_UcILGT_sd^lrpEfJq2+hZ#vKK?Sm}WBX{eL zS3>&4V;&tBs=!sZZjJ2kSztwE((`2YSs+81)*r!}1`gD0$l4{526@W;4d!_~ftUFQ zXY2F6fT;^3ykgqw;NI=5r$JAj1Njv$p356`EsK<~ot!xElTJ1qTrc8xes44MovI^g*j@^L zig-kQl2QQV*H;&a=rzFRy^^p_fQwL(Q+hY~rA_posTSZ9zgWq+Jd@pTpuLmc6>8NE#b;eRs2JS{dG#3a*_R4HB14?^t9~UoK>eO&|N1eI1pSuP@!0*L2D|tF*LH-l$TM=GQ2^FkY8tclco|j2e>lmA(87q>#g^u6I^Lnad?YafwghNv2`3pFSH*77_Tt zG`tdiZr=_G-d2fkbXd}uEBjF1hJ5!ia^xJ66B|XYNBoldqJhsw17q>bJbYadiukj5 zCT$t^1{(e%?tbp$=9y$Utiy!+_;8--z7%WW@69t$;CjHP?=2vBW*+|i{4D{z0NaIg zGT6`6#uEEv%184%f_HH3e|+bdT=}>8&%4i$|7f0xetPE23&EJMDf)l&9IhTd)NlOn zdVV+0#J;<0{+rKZ7XHWc%tOkU#{XKL`Dg3@lUev*2J)9^pky^Kxc}@KX#Tm8-d*E6 zUGwW;!?CP=Cpmnt9+t-J>}ml&9&epOmP-J!?hcGxk8+s4%P_<(z7x)EbW$*w*9aay zc~u>@{yw<+u(sc30vpEMuav1=R0XNM$>Jt^TEY3;_Wc*<6oPuWc7fyO9bms{_T7rs zSFlt3)PtNDrw9I9t3jdI$><+ezJ`19^|XfVs-V=JoCVvGdO?=N#YHyWzW{qpSiF2e zEszYk{b2A|Jx~ysrt3{@0z6UIQqGd^fmTJv-lMv$;2)i~X@{;S0K343@8m3dfpht` zqmifYf(=!V8`xXeFvF&g{5_=#lz7s7ww-SW`aAk}2(Ntw*{-HLX!onYy!Gl^4((_F zvZu~IIrQ=gbU&T_;^d)PxKvtVi|*lWu(@jcw~>!{GCDTNOpZVI3W$6iQhM064SYp) z6MQ;ltuv5t^$X0lVP*>1@ebc!d_0x?i9g773V>!UxKr z+ghXP*Y|Wo+5@RW^$%{t#i?E4TCg4%T=jc(di^y(qu5^d=^cQASsFlS#ubp@F!P>y zOB?JPvba}t^(;K2bP6O!6v81dp`FrlQK0X5wm_(CCS2^Uus7g!4M^O*Dk!XY7`UYQ z(+xY*;R*5liEW_|U_|hip)A!dC{etOA?QrXXg+Ls>e!2VsC{f1Fwd_52f{br-TpKa z9L;fy6mzWrb`Q7aR;yP7i+Hn+21Pd1F7mLf%pCyYB7&6zce0_Lg44AMhblOHxZ;5Q z<3iY4U(@HY_9f&yAtjX_(+FO=#b!&LZUzZ`hWl?l=mJ-7?Dd;5aR8V+HVqNqT?bko x8?i3QR6}yy+Rzlh2C-)qD{|JB!30;^z>@kW@LtB&8lx@sKrE{CS2Ol6@LyVSHP`?E literal 0 HcmV?d00001 diff --git a/pandas/io/tests/salary.table b/pandas/io/tests/salary.table new file mode 100644 index 00000000..090b53e5 --- /dev/null +++ b/pandas/io/tests/salary.table @@ -0,0 +1,47 @@ +S X E M +13876 1 1 1 +11608 1 3 0 +18701 1 3 1 +11283 1 2 0 +11767 1 3 0 +20872 2 2 1 +11772 2 2 0 +10535 2 1 0 +12195 2 3 0 +12313 3 2 0 +14975 3 1 1 +21371 3 2 1 +19800 3 3 1 +11417 4 1 0 +20263 4 3 1 +13231 4 3 0 +12884 4 2 0 +13245 5 2 0 +13677 5 3 0 +15965 5 1 1 +12336 6 1 0 +21352 6 3 1 +13839 6 2 0 +22884 6 2 1 +16978 7 1 1 +14803 8 2 0 +17404 8 1 1 +22184 8 3 1 +13548 8 1 0 +14467 10 1 0 +15942 10 2 0 +23174 10 3 1 +23780 10 2 1 +25410 11 2 1 +14861 11 1 0 +16882 12 2 0 +24170 12 3 1 +15990 13 1 0 +26330 13 2 1 +17949 14 2 0 +25685 15 3 1 +27837 16 2 1 +18838 16 2 0 +17483 16 1 0 +19207 17 2 0 +19346 20 1 0 diff --git a/pandas/io/tests/test.xls b/pandas/io/tests/test.xls new file mode 100644 index 0000000000000000000000000000000000000000..db0f9dec7d5e42c87dc0b0d297b66305f7af4225 GIT binary patch literal 30720 zcmeHQ4RloHnf_)n2>~nwMF`qz3{voigalBALcoA2M2YEup$?c3C&Q3V$Y5q@z`-Wn z)-K)EmTp6pny%Q>?%H&N@w@kafA9Cc-}`&-dmWzn*|Zan{?lbARmxGOveZCgqRMf!yKv84dw}InSwGE1tOrP zpY9scyJbi=s)%Y)VRaX-5nM-mT&&zK%B{g6lljDO zr-fOzTCaAh?f9z)9=p|A)vUH^oY)dmjks?f0q8uePg1Bq#9@S?gI)7*ca}nrBU`P; z3IUBn5G%z-iO&+OZ)neSeZ8=?{}Xee#W~q(w(sbKJmpq7*vyl-KH>Qx&Pu(AvRUn* z?r#i63)Tg0X>4u|?k@OaALSZ{dKYn64i39e??lOiMPVEGj`Jn@{EvFn4`eKQs@3lj zyP~n%sQ>g(Z@E_LaS!NoA2jg# z2V50LN|$^1Sy{u+8g{LNSYcQ+Jm3W}B$6p;c+~QtT~~PC4=IB^F1)&M_Vu%8FQ~ib zL!0NB%I0gX@LUCp58G^-(7(wSY`8OdP$3qB@iv2s79_(#ouKBzX%wc8N`Yn#y%BaD zmb;FqTw7o;v#Oit>4weMe8_4joL>mM>gKJi!zxAU8vJ3ZbKbQD&Pmr6B+Xa>=K(k8 
zTudlPUQeUuSaQV*d(j)~+Ak_~I#B?Qnd3+S+sQ-S{#It~ zl)iN1hifO)bmK4Nc9$w3dDbEOoDJUTedteZzh;>B>xXH-Hf=j2rMGDNQ~1azDqVXv z;;MA`Fm}@T)1~R-NWmA~9+lUgmEd(+dyc+d-0soWOWG&s>-6@C`g&=5uD+^vrLQyE zvlD*<8!ncR?&(*#IqJ&qFj70F&c2zg-pKv5Qr~)2?c-_7!CgVlpDq6nN&YJHsO@(=&r=!y*py+HQ&yI zEVw^-^YcyncRky*{~_FaHtsw~)EJJx)pW=H{5^@oYBvgwZ{Pi>O99Sf_r_E|;M@{@ z)P?(lgzdfuV{oM_KGCtj!!^0K2lL9B@7UkIcIUzFJr88N@>JgR+v{id1LpK^T((br z^MfAFWr)OU0D%<`J_0=a2>PF&Sxfi3v;G6Pf@aeczgl8+4so)9Ya6GZXd?tqDcQ^n7{X7>DPOLMee=) z16q^Smbo78huMEl(UCdkX$Fu$9uN* zqyMFpe(rzn!O7VV{j%|D>eXKc?$x+(ZO$iOh&S;gzfW8Jg!xevBQG4g_vf=*Tc%HU z-G_U@9s3Kj#If5ROC087xrFa|N#XlsH_f{ol$o3HO#S(Jd~|2XdlAkJczoymG@OZ7 zt~&dT*MGk5uId$ay8T++ew{x5g>Grr(kC$IXpQ%{K8c}YC?#tN!$FTZu|Dui#v#W(HP(G)B$UcF&;LG`U`*Ka5QjN;;(s|pJWx5Z*RmKGPc zwzd}4b5v0vvYi8?#nmm59l@5^?ppzDJ|+}3#2N~*j6hF92dxc_fmqq3iOa&l-DLn_ z0o&2i*c>aWt`7w(A}!nNal39!q^0qmNOP>d$rp?+E9Q7iv!mOK{m#M+iz-Vso!k_t!EV|VWL557#Qa!i$H*M zQ&~yrf(6Tp^(`Tfh*LCH9}8AA)rSUg!gy?BFcfSl0~B+sQ7kq%E-N;OFEefD=EZF- zjX`iC2Dvhe4HBJp_W8hxG(>U7&9;>!@8DQr#ri;f6NLG~VM7W*wknK4J;3V;NcIJ0 zkQk7%}sdi9CRjCbUg8=uL0QZ)h<9AbqMw+3UeAfgXD zvM}>g;~mROrgP-BA4>7$69?TkRvh%@j#dlP4ULgdz25QjcejV3!LirP_4Av?z;>rH zHYwt}so`Tgqp@HE8-2k*?0l5`cSax?<gTLg++MvWR68nTL=Jp zl=p_@0NSY|@4XskxNR_t7r0=gQaDsHH23B&OyQoz)#2JpO1%pfAPq2hl%|yl3`%Nu z*v+vZ6WE-!aG-eRJ+JNcP+=d5FmAl%A;+u{xADKdx&DH*;u%pmR9WAJ8NYF1TD!gY z#v9HL6?IAicJIQm+v!CimBgX4AcnhSNfP^v-A>yLULAQLQi8%Q~x*4r|mJ_PU>Q&(rCW22@-sD~oAIS6c%2CW*pf zYFnfofKaLYC%+;lCM-Q?wZplb7=OQ0c=DKyx z2cO?~WNn<;#!6}%Gck4>GhTT*p>4OxTqf)6wuiDgE<34B<}zh!k9$Oqj#(8*V#(SjG_I$$8CfYd`nKVM66D%v4(}sLWAKRsTEX-k+~} z>NO43jS*8-;CStS)x7`f8ipGfO;w+mdUs*@p{J}CLcMfv$4k#F`i~WaN}Ok^b_;5j zhWgoEjp>8BEgTA|NO%5f9db z{2L9Gyqu|ySWvSy)bH(m?d${Bzo7BR)=(c-FD%-$_C*aNTkk;C4NrzD{{4(KhEN~* z)UUU{y65{EDy3njDkuYy!(G=M+AtkWVr)$(iz#=iTM_#JOTTXR0%@A9tp3$8)CXQ^AZ# ztQSMckM|?_%5<+9Z@@t#- z?28)1i!rVg<2lpR%r7;@daxb~FBGcR4U=!QP4h@5HPvqq(q=!4r~T|Ky}7sS^iY?P zlu@%Wv%JQzG&uHp%)V|nuPE?#Z9HcxpB9pBCb$~UnL6bn^~JQU3S(YLZG({JvsYX4 zQz@h@BxlpdbEX&Wi}-ZO;Pb(Vr%xkYNMF8WMBz|neHUhYJZCy4f~%o+JH04mYPi!U zN9U(+9?zND|3k;0B(xr$`{OxN>uJTAiA=*I6YW@9vq67BJf1WCljKa9b75{$a-4Gi z$I6-RdsjJA=3AL^vh%1WJId@AkIeI^94VPo<`SjGj2ZVqYRthgcPBMw%9wMrGo>q& zGNsP^t7J-@`B%x5I`gmiUy}jRnSYf`>6WBSsWbm7nNnx|HET%zRWhZ{{HtV2+mkY- z&it!nN}c&vkA~{Zzj}t`UnNs|Fey{&%)d&e)R}*kOsO;f3NB&?ocUMDlpe95N~YAA zf0azBGyj@1B>yUzQfK~EGNmT}Dw$Ge{#7!i&it!nN}c)F2}AO)ltVlJnmZ)_nmZ)_ znmZ)_nyaDO`PUih6A*%f$iJq^!0OK#R$|q2XJGXpU<8dGl7W@J@$(qi0WydmT~P7) zJ6TjJUdN9=j~y7m=S*3XgO6LGj8?flG$+)n-NGUB|h zG0+l;Mz+QZZr&9LHWlb~w1zCHSX<5yzI1z$ZZDGGSs~o>UXjWf4x%lpS&=h=$ij)P$$o-QRs@cq42i=+fm5huc46NPoPi^et^PXn0X4t$#*a+ z>Bq%T)$%Z0JAb&u=F7VJ0N!OpqT7T+=v#^M!i;+#Jihp-i}&sS+@^nbPUK3d52Fp= z^=U3vuU)m_D*08QN+3SXow!IP2GCF9Sa`69#}y!ZQoE}ipA+JP!Y}hnz)$K9YlJLd zX5wd1AZawHdD)X5JSzinHqhky5fzE|M>|!>f4a5X744L3cvH0V_pROTXlEcl633k@ zvL_tph`_A6eAYSAj0i174f8UnCy!IojFU-P&Cl zD=&?7R_!mF7D3%w!Ce_GFAaDJ(O#o!e`Sm<)*3F~p~#-M94q(^h5Y`0dxg8K75sgR zp9%TPODkhs&#X|F-M4zpsPy*9aEw*6y0)t-ZiU@bJWYE$t61#hJuMI}P!mYbf!Dxoo`G6Ku?foWNSUfrbFbj@Uo|Y7l|eZi@NKgZoBt z1#qd)1UG$ciJGpEw>&T2>+h+GmgfcXtD?1iZ3*7Td&!0JJbzE6FATK<+ttflvG98ABp=q0{ImsVSiP$q9o+4D2c?EpAv$x+RWO%@;u_t_DUal?Cap1iW0)8 z>8kXFyliD3=>mr?ZSAfo@pZH%!hX)GD1nW@m_Rnl2%d2#RP|`bZ@EA#~U%B1Ixal>9w5k?~$a zps3b#Rg@6vnyyHkCJ*)n#6>%aE{U!vA(6muMj*eok2APAT!ZUYl#s-#s9;#0$JHxJ zNG4`vh5dvH(*E1xX>Gs1-{=DOnrIR&Y9NUYd5Jp7l>2yZB+j--ob5s`wSxSWks+`2 z*L0D{ke5jEjwKSOoN=8%zQ0H2&;r!<5njAE4%`470anGZ)mBWp`JS0h#wfVKpw zq4Ws`yeCDxhP-5$*%4}#j3SmIdxlwo{6N0Y5cU(#crPVZ=2O&ax@0Xfk*yF%KxR$) zM8e~})Ee#$@#7wFXKMQ>gv6@0PxhQ6K#j-X3{wBcVt%xfo)Q;^fZa$EjvSO7@y)&=*BJ 
zc_p_gSJ)s(EjNK%K~<#XA_oYA$R?+DcSH=xoLqwN^iZIjq z(I{}9ST2szM2OhyAvMzjLQWooMYQ-o{N)y^RZ_1d1LA zog72QW84(irCf7w$sfumXR}5+|F10pDG&SUb*ZL=$GN0coSlJ0w*{^t@S>D(4aArd zU}xmKPFei-_xXW`@E_-yB5{;9Vw91jXLIjG(zS+6p&23-wA4a_P;AS&Xf-W2b-*dv zq7SqK)Bx7FFT_mrfxAcj@pTYW1`Z4gs-ohoMDpn!WOt;MxS3GRjo5CzAs7N-xiAJv z1ZcY}Bk)69*d!VqLoON0Ie=K(H&N7twmyUhL8`xTU}6d?N^1MOPm6#vv?Fo!7_`>J zK;mPKT^3u(f&pJgHlIis3@(1WDvJ69;zkoFY9pk|+L~0uTgXOFI4-;uQfX`H7^pd! zyDwRWo|$vFAtamH%sW~H_Bn}-cJir$PKt;M_a%5{*c0}14nuvzTei|m$P8k~5uEMs zS^PM+LLTLvXxvWaS4e`Gk0S|nMQ`HY#bM_d&p{Fin?I?IZ}GTj3t@N+z);J0QC(axd;o)SS=B=ZO$F6x_t9<)t+}p`}8oQE;Owf~+VY!IN>^`sAlE+INxD zcrTxh5(}w#;HEJQ~*~xeNp;T(r0n zDpE9Pj`%dlM_VD+q2z|I;Z1lRd(z2Qc^)HSv>yaRgeXTG8fWvoLB5jDbZo?iTEwR>*2F>Zc~IsFp@K2nM)&IE zDY~_IuN%A42h*n?+Gr6p)EqHVq{5BtA4dbq1m}FKO}~?}7z$`yOAAD{ z8Ef}}M}rUqx*mx)PFzLb1J5&n@S5eEquu`F0WaoVF1#W8X?dt`be+^MUggb&m|=OA zpy-j;z{h%nLLH+(F%;vj^W_A^oSuhoN$h73tT&UdA&8Sa%~X_dBqf5O94%I*k8=n~ zUM_GBaUq_3e?w!>IsP6Bervb)X}S#tYqXEvr@1SbgO^X`dAO=qtsu>J2pDT>sh(5< z`Z=MS*Jvl-`iL4(euy3Sw2M zE##nh4QeR$!)UYp-hi4zn=647`{_^UH$)qyMlChe8ttWl;waHc-bqkOU8eUk+DqQb z7zui5A-Ez7y&%^Y7cVtyhu(2$`fE%2{l}~J>$itm9%Q#kCxd)3m6U0bD`HIFM4Kd! zS(F^wHfg8qvX59(dsq`X#PHH;i6J+kDaVp4T!U-V!!axs`^B0q3u5poKior(7wMp` zh}6h<2DIEA>bSAr{{F?UMX3veQqLD4c666c>Zt)qQa?pv9f=?4Lg<_bgAc2)h!rK- z)TiaA1|Pv-pNKzkuIWNd2?*pSSEB^es(7E~8w>lD3bl?B8uHR+@cpZ3r|qXXT3+GE zJ2h>;`TiQ@xhrGgO||__LWzJ)C>cfZk=7%d#d+tOb6@~!|4GY^O^5j|)*HR3G zkuO^69PQTcP%Kj|;vwov!Qp5ME$7+qRj8a2?1`{SC`du2D6x+Ql{FDD#vsn;OHp?T zyu?kLFpNQ#sEN2(!V$woQQ;d+F#{YW^pUk>Fh>$gQ8=M-RF9TL7Q=mqY7T`rf#Ik# z4HWNvNuPP&+AU{lf*Bu}RddMSm-+a}p4xu#nj^y>Rqmy9lE{mscaZi$Fi(jXi2%;w z&M_BPb11abcS!azlQ{uz2${zC3za(T&1{{(tbjNA6@Op!E9YKkRGeFoBZPnmf9sRw zrRU~_nL#oTm&eXM!)#Vveqa_}*{+YjaBd>Z9IUdmwKp(JupD`_Mn$+;fP}+%9|}PG zsR@~K5Sn&~0bj;*z(HpB{aQ+>0 zk-`|>G%BS0{^ONVX2?gwMUIsKeq?ER$3_S)0Wr(yj8Mp5H>>r@Q4pdzk53gc5c{l8 z!elFczVT;&xYR2A`-9u@FZ|txe}nO3quvkxJA4VpkqH0~25rb>V1{EH%l}DudOolJ E|39OKD*ylh literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test.xlsx b/pandas/io/tests/test.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..e6d3a0d503cf250ac54b6fdf459a1324cdc18ea0 GIT binary patch literal 44929 zcmeFYcU)85(kQ%MDux1ux7|yD8NP~%cYeee>D{Egv%-$J>@J@ zP$GcxuznUBD3SEvZaD+m8-q91b316`}#UuGLw6%=D+@O z)g^|t-KtW`dXfB*7?6IE+4w3+_fpD;vIzgYmSThOCmY4647OMw=TqYCJ-T&bz9$HY z4ljb?>%k;nBArd$wo(+5IOKLO+=I zCUc)wZ#4CBiK`0#F3AWIx#^JL$%a}q?~n_)MNgIu^8C<8$Y8lSc*jhcKJG91rvW08=xLc57z}2>`d>UX;{AnuZpc%oAV=haJQW$>?i;A0yj%Z25B_onAFg$3glN$;-AdHl!J8U8c$_?`li@g*&$-@&FkTLepC95{8QU<%<2fX zyIaouas=cb|L#44;PT7gkg80W0YD7c%Y^Y&`O|?y(E&cLXtdAnPWN|bVuH3i$kYDs zF3orQy&+?YTZt#cehycf+<6lB=Jh@O3coEt-`P>N?nuoftTn<&-Rvl!~o?ZM<%Q+n~pp_w&RQo)5MM6R3&Q)6{WI6KX*h9%X%X^aV zkxirBd5+1yy>w9B9sbxnv#0k=;N4gE&*ZiEpU=8Umzq^)epY?%Rk=`?Uj&z{R>BA^ zv<{v7VJ1^+g;|?medyE;7tWDaMt0Y4n^eCMIQphDqlq92<|1#5@0sd++&AQ_AYwnuwJ#jq#=8$U9-x<<>ER~ z=Yhq9H*!l|Yy15}wM=40l@X%hmHzWRy{UVpD;`wDX$ynf^=>bF-e^BlnDFlAqID<{ zREVv8?0(7v4oE)WsN+n}Yk$FnER8ERO@k8l)@8MKM$UMBc(S~&^uFK8&?;`T{YnOA z@f~U%Jv82g3ZHh+x!O74n+IQ~vrk9xS%!a02RP>qo(>jH{N1#PD&?)~shVXt>vGOwnGH`*P=Bv&lC0Wz~N%QT~t? zdri#bcF@}IK6RFtF?_l4xZu?=1AFwHU#{3kAv2ew^q7#8e&Z-N%lmUX-!*bhwi4JF z8@XU4fyF|^Pt;UHlsC^1Rnw7#>JD?e>ZbFwS)s#Z^KIh^_S7(GPFmpoJ6~@vXGUG? 
zJ@VNj@n}*u2S?WYoon;yx7DRvE9MUOYHc27Rhw9tP!YW*MRs1SFODB)6cHe(} zP|LA1cUV%wF)Tr*U-#+(>lN+Gm=@|UEoiy_J(B442YlOr48n&603eImHHg5Vo4)RW zyAkBxJ&O#;91>RUZ^0CaT9$)#6NYWmw>Jha7F#@b<`!`+N8Hvc8blJ06F*80IbU7M zxv9H)LN@Ye&0%{|bB*ki#HoA%7f#2swsLE|r@}D>UCdfC_M=I08y&-QYpbQ74Jx}P zjrK|*Ple$UJ%%`O_yepJbogc#XE^i6x?Sg`7eQeR{VI*4#7O5}1ci z)>m}h%yHiQAXVaWP9J9Ay8xm^={uLPvz^%Q@t)wuHH?e|-6+d?>>j5t0hLw@3J zdY*&|E+G}4J8J1U=&Ska&f9}GXAe{AYwonP8(;dxa5J-4qU9~^l_N!7G}k6s6mE^# z&2_kUe%tG&CYfUUDzK?0(N81TXrf z?xRTV>F{*J-z#M$Y3wfr(@cInc$TqHnO^a1ymX8#qPy7m*5VZJjku1}**W9izeP&Y zsoQhO`(k@azG>>^+6;7RO~v64AQ$M0OG@39h8Ith| zDN(lJ%2Pe)fx`^(;IC)% zFNDTT-V332f}Y9d$SbH)AHV{)N5!~f59dXWXO~#QGG;c9d7wte{b3dFg>TyB^(^_a ze0Q~4oA4BRE~2R9F?mq;x(ly&OI~6qrt!Q>^MhLI@BGE=l1&{9Tf6Ryp%#J_@%`I` zX!Ewh<5oOd?mE2upMBTPdn)fgvpP5EjXmD9sJ@k|Aa?!*tqgN)r3&Xew6)b zLiu!YY2i+PGp%5Cr#*y9Z9o>nz$uNJuV>Mo5S*Xhul1{BWJQamc{`F4KJczNY5S0#56} zZ>{TpE`rwe^NU?ZT2jnsn$zDu%4~f4`pqB<^$tm9{dnpc+d~a81KWr48+t$P zRgF0;zyIvPb50F4FKZJmtPM{|KQOaTOx&UkTjaAaLMr7(Ld_$A9h zY11>0d!28x+*2^nx0Y$>%6q%FhAOrH#vMM;mW3uJsr4qiVg7FB#ub6W(dzx>=6pz5 zR-0Q2tkkCas=RT!d*2JRS<4zfa*L>om*zQDoD#k0$7$^HC8$I-=Gt4951owjvHpC> z+94geI0ln1*sueLR(xALYooz^h$pN*R0@+GlkM!SS3~(Nc5ZYU9sysHvwR z)eGe@9Q4C+-QHRG*!Yl9PK6-B4!f& z?DH!}uVLd?jw7=ey$wsXrs`Z#IVyLp$&n+5XRh3CxN-V)u^rh4Zt-|;%1C8fUe46I zrLO-&fzzi0y;`bUH@~+Kx7BbP3$?Qko){cH^36=`sdEKO(vd^3p=)3E{oH8E*4%SM z$THypHBLQN{!V*sc_{I9iq=I#*O=4rJ9!PFhG&cOjvYG@=c1^xdhZCuzsu;BkZua~ zo%M|Lpm_EDPiNgDWO~#&tYh4)!#&jt4xNc8Kk@n9{ZJ8I*|Q7Lx0CbjEDmzSN=RPN z(6@BY;|NN+tQK`Mf%eoY{bH0z7N@g%Y|3vgySj>snf`N0Wvo>)uPc7{eLWFPYJZkv zdyBoh=#zi>4foPPG;pqeR{N1*gfrX#EKZ)lk*_APk<7-dRjNM!6S{7!w-;fsrK?k3vDd$iM+chfm<>&pLu>Njw z>n^VV)x=Sz%v4}B8FIgX*OIW*Nnh;a+OtcyA`ZOtrU_a0ru}+f@Tc^E!+vzFC zi#vI9?Z;Qrrd_r)hFNa5YNoFe@q4F__aoY>HS9O2hH%cJB5<1@tq_q5?E0+ziec2> znaF&-o7C>!*2vY*%)|p2>x50zxg(7h>4Wf$b}fTjKBq*JR?XI=&q;6`jP}o1NKZL9 zvn=lczmnSUbxLqh7u&X&$(~-`VtQ`>`g^Qeio}fa+>7q`?5Mo33W3fYvlAOu%ChcX z;5~_i*OMo2=uhSp4Qi|w1vQ{ex{e2CkPJ!3&64!DBoICL!}B~e%;lJRe7wtjVO3Or`FeFYZ|%v|n=J-BPDl2$cu3A}$DLAl?iGx8!@q;4 zU(TU8UNVA9huiXJ-H<4g8glD6xv($!9y~{94^4@6M>Zky8~3(yUXQZP(Xg4}n_Pj; z#y=^sA#t-KiOV04D7X4w!EZ_JRyAn{Zzoyp3B*)bmks%=W#ZxP$DQM||x;fvD z$6?)9&6W+Btu6B6F0PIHYJy5_d)03`O?vF#s(VE8oAhU!e3|q+Wv-{pqh;@=itk4q z%-tparzyW>7W@(;C!fSJu!+tW`hlPQiIz48yN32gE}Tu^IQZq#Rq0q3dziJ*`r(HigcdO`8MmA=N%MOF9CJo$ zOP|WAFE{NsYA%n>@lBpOV%LQAN*r6e9{J_yh{*Gv@8|NYYD$C@b;c*|pOXhSWHT?; zo;M$`()dQynl7HPR}n0Vm7TAUR@`$E@84Lj`n%mr>^3A- za-9EO8+@e0z}4XEzUSgZPAxI@l+>bKz8K$^ z%E~%>?}kFrMEi-V`m$gud6dI%Y79zGE@kP#|@5?5~N8n$+m8yZLQn^u_SKm70AY z(%rI*PcCL{g~(Q(N+0u!o^$zZL*!?e#H`h~`rc zWi@WJ$z>8&AP>e5B_%j9EfVG>ZYy0k;Hr$qHn?gsxB{Ls>NPxrA55Suv@z8`S@4B^1_Z4lgop^Sw;#~PSbAah3O8{z z4758F`)a!I-eUj!weBI5&&k+}x{rY01;3fn9$Kc8T$HZ_bYM6X6j! 
zF>yL&W;OO3?-@JfkQ4U_Q!UeNA7EQj$1Bt9dDV!A4y@mSa0y8MjiBU?!J~-82W<~{oxY*ZV=2I%7Up`ophv6a+x_|H zB#Od2u8l?UmX2G=F30#(TjE{R@5*dni~B)2iNNhY_gSC{Y5Gg4>Wt6vM_%zWxhsP+ zoE{h@Z~H!d%(^d0zvKNrr6xX2VQ)AqB|_2X%a#{AX7XD3SYty#$)#VOM;$vb2>0pC zfvS7sv1A|Sb+&tDvtzbd$63Gj%SmEM`$f{7^@edSA7AD8+k`CcUJLwt8o=1fz&wD` z0Q~=d8t}(9ddqGyeNvMzLS+!gx8|R$b*I2Z=Mpl8&2-RcSa!f6$C>Mu;hBQXy9JDC zgSXR{svNu6e3Nyxb92-l=8QLG|KiilDN*v(;6)jBv|V^Tb>P#iE^TpOGgEz^L-xhV zrK`u(c##(31V`Fd-_p1yaiZn8GGOrOLuX6ix_bH~7X9u;ny;V2(vq*O2Ty&!+n3R_ zw{=qn-wt`iy=i!#qOc@d(m!zh^Hjx~{)z9sLl-LKQ11@~zHFTm4i5J?wu*mptor-p ztNgEZFV0Y{4{~l!+(|f-V};PQt9vWSL)nf`QA7lWYT2*3-_fe8)e(+X{>k04ak*EU zZP>L=X(pOCeDJ}PZgb(Vg;-DOWnsKZ{7rwA3g&D*tR{P=PIde_7Ya+8?u%-Z)#Kr{ zCcF$Ct8>hem!9Uq~Zy-1)Ukn!YreCpv#mHCg0(aZ!&6_l@V0z0z0r%asisp{nY| zm4|WkR{Yez{k1mYMg8ZpS2jm^kDg+#_#UcEP6Xo-7LLTm;6dxJe#++NH^CS%qmkA_PdQg18#%SkXCFYUq9Wol)%jV;+){7Ma@ zNkpagFsUHx3rE_nKk2fLSx-(i7RU`ynhBF@KYA-qO-P3Cn337=8X~}YYV9o~$8Ik0 zD1a}~DCL5J!ng9XFSTbM+}5|@KE>Mj<9`04n(`TKF4@;sMj7IzS2H><8?%+{TaO7J zF$&wa_A9gPqB~*rc{U-M8u3e%DI?9HHp3+lPhE0%oeg{vbeAF z{Q0w}AFA5NCH6Uu$nCKBqrSovD$oA7T@m_xB`spLZ1aKtl{TCL%;fo z4J6Zizr5n{wSC+dvbt}p$?r<0(2t+tFK~(O<1S9ZtJbPY)_Ze|Rbo&tX<^JY-@7Ax zoi ztg|-^rd;g^=T;Dr9Pn|am?ST{b53~N2VTDRu>sYFrZof3_e(e(x7qrS0aTTLie^yg z`lB2IVB!yiA0`68?w>H!b`9-j9e}+YV6&x{#>@2s(zF~G#c z1eih}fI((&Ff%YfTH9C|o1HU)IskzEsmt~2zI&tqz|TJ@z{W&h>e6KgDfV&bS7uS@ z*DW!C%f&75`Wb8MbG!5J;_g2GwQP+3c}YL4^q>3x75l$@|Aa67fpfve0K(bj zmB-D?F8r-M%-_}03WA}rdkTEe=DT!uao)HE8bG=P z4ny#?m%G{S8bkar1J~RQjUZSNf+f9zP5(xp`D|c_?QZ)Ybi%w0EFl=8!yMr4a@HDx z#UMB-0By577sNAjfxEBC?pzRk=AP?8)_>!H`G>!+Ba15#o)dC?MEomp6z$%b-hukqHDtbXb15`(NDE z8w9h8Tyrtp)vYoF!vQ-c7r-4r1Flf*2KWOzfD~W@m;n0F^Ewa!^>_d%z!z$9hkD(i z78DaV-~+Y&XFTgaaQ+Ns5Dd+{%ZDAHe_Jg0=wD-`VgisT zw}A`_#LYjXME|27qVt!;5YOm;)&LL)Y2feDDgCpada=zSfAvUxVrjTCyssAvSp9eU zW&O(fgY^?skF$RH$9UFB)-TX!95@55fbSn1`9WNv{#c2?zwng)MTT|=GYD{jY7&Hp zhIsuur~k5|9{#t*{?W6m3HLNS_7nT%{Fwb!wQyw{}NBpmK^ZiHvzsQ0DE@An-JB1_LIMH$|A{foW<h9b0ASCG^RRaGsiGtgG%E@kHLrmQIC;_EB*M^~U!fP0{OK#2P_Wngz7+#Lr1 zt2Te^Z%hXU{?Qg>0RVbGp_n!Jk2Z&30C*7yZOiKaXgdnUL?Ju?G-$d72Za1d588P9 z0U*#r?ZhA7nfB>Jv1yvYScT#h`&|Ioxx-*=7cdw*g%JHO0MO>kAOS%}7?cM9SXk^v zedwJr7A7$O`qkqP&frYqf5FB~N1*_|+r9??5>Wul-*9$g^={dXuS|bE_dMS7cMK-r zG4tPH;NdPRV-hgfJw5y%m0cqLM+9~UGui< zhnCmvn!AUm7s@*@C^#e(b2BV9EIGezk^Xx7+1P`Bx-n(D_&XZT|9MWnw6d~7ukDLdN zsFP%j$4OE4kM}xOP{k;(Lg1mdDqJ`DW$&ZOsko=z+eiNoy^a8p@;h9~~z_d*gxwKaI}G00a$EoP>!gzee`8X58Il=XWDl7 z7(o0fN+CUYAQnN%A4dF^EMlhbpXaYS7H3#S$3U!uF+SqCcRmY67GHwO3Ux>*#DzYYS9Q32XT&h?VeulmfGe3 z3-yY2=`_*LQ5=tX`*}YBz~!26s|ala(EFkp%f?obiRBBmL$CFhI!RiKpvU?eA(%N%#>(D zP5r$xuX|Cvm;P-fqIf}DJSXDELr{$Ya4bXBogcuzVMw+yvQ9 zN^h5#aYhl%ZWOCR%Clmko$xb1E6Kc5tQ$Uc^j;f0Z>Ze`*9ynQ9S8_6P^q}dXR|$a0Zf0b_DTkIFeoo?w1~L z6V{7JK=Q0vB{WtP7J||pTkbT6m1wE-U%f^z>t36pnv#yBX8?wC?Rd}%g{NhlX8(6Arr+(` zs@~50U^`W=cMQi*IE>YxB&K53sEe~&om{YSF&*kdym_Rb%hC6u4$jwP0Y zuPedWu1K-rwUt!%u3W7o#Wk*FEt_<19W#1IZNmv5G6S;u{|N)VG=0#D@~*QDezLWS z!W&nv;@b=(`!tz9kKnRQAx|t!8nxt3lR0bvs>Qyica0A7y;*os#K#So&8Wl?_o17) zW||g>0LFOX*J?-P!N#V>hQXkD&)2UTJ}!OFzgZj}@XK7i;+L7Uz=x&xU3V>jVEn%= z2YVNRXXhTxhm7aMgmz`&M9bAvEkaFRe)6j+1#h6A);;20ZnTofM7E|XIZ!5#C8f33 zMfKI=?&VAXx{-(&`f)0vmncH-x20X8R>rNj?!%s-RCZ6ICA{dO!!#Wf8O8%9c1`pq z_>l*LdD*cSGrG5~jY5j7HMck`aZ%l)DgDdukWGi+F552vbT5u@AqtK6kyiF&>NnD; zD;bX|p($8BGPjlqRX1N{)bH>};U_=eA?Otu92x%F#V>dxye~B{AZWb!K)tYP*I;vp zR~y0rSJmvp01P7ULq>6j0enx0;$5l~V*rE1VjSrR?)Q_&%-hF$TGzDZk)Roj?&Ss0 z-%%@fxVP--0wxR~R}4V>8wKfXBWiq?O*uVTc`c*^yvG!YUF|_4ex!7ROUoDx#}; z_(Cg^14HO`{^sa-?~CJ3*4FYlg~9sVkLGi((g&j??I~Ov_dg=M7yuvEn=;nP0E8Ao zwldm?NudmYw^@H$2D~Aei6RG8H+@paV8ds`pJI`b`7$8-Ii|4FS&Nd8M$TAMS>_eM 
zu=gCe^SQk45&C8r(;EK2VK}=K)Nvg0XlWR#&EP_1F4#sCp2u5Jgv#%YZHk7hu3W1d zoeeLmH#@ZIiGnH5g|tzAeaP{ovbiwjE^ zCyZRH#$00lIq<)>M&^AW?|Ep?c~aSY5PPnU=!)crC1%MvIqC{+OMUuKz4KV zpWS%@Mko`HUK6z!A(LEep4c)kigRm!&z= zhuc@ZAw{S+VD0GpslvC?o64OqZoOjPHu56KLK$}ML-DooPuWK!<*21STvIaT!5J~v zy&8Qd(v#}L9)piaJEr50N7ptk!#mN%CK^5Sb|>E%FT6Y0s$(i8*|+m*R%J_22}dcb zWdPs&0NAAD3CLYfj6s`T9+GYcx#xeuxM|Y0GNIEBWA&umukR-EVRb;iz}YEJIzIzo zJ2Q(&4pWX-Ne;dt^YDbjJSyB%G)w2^y2Od;jo-8u?T)z%L**+K%X6O;rC$SGLpUEq zyu&&J=(ad!Os(vcFtG(C+@3GsMHHpLoZ^T|(&+ui=Fc4$b6b;VGA+~1^`uQ32Q?;@ zOzhG;hs!m%#V-thZ!}-zdX9;bpC{JIwWm2=0OJcj)6Bq=Ta_Jt`@lSM#QlM8=!Ae8 z0+~Bs6{&=lgRg3jg`cc((lXbwd%0L|f7b+-SgBn8LSwds_{GR~Mgo@108m!U7?m@F zy0+wUT$>myfp(GPWZ&y7I;IkNXr4IU$N*y2^p07LY2buVTGsPb#Xn=+ioLgtzE0dT zD$faykQK={e`MJs9>rr|xl;>gUD(9$Q{Pb|C$>LdWB_R@H-$U$j>46}n-$HWH%b?6 zF&-Pk22m4=FdUU&vaOFVhP883VW5jVs}J$Pb1h3UTBGHLr%Hz`8|^O1fW%M0I6 zVNX${3C${nAZ|HJG*k^@?aRy}4&*$!vXeh_xk5!lYiJ#})ak=It5~!75P@(Yh#2JoFNg8(AfV0IQ-9lRuOdTUB1Nc;fwi zd}-ZWzJ|-El8;b~Qfv&aPb^nnUudfk4!SEhBZ@jbhKoW5F{z#P!i%sR%^dcmW<7YZM3KYB?ajxc z4)x29lflNjHz|`pHa9|tql}(l-3RHXK*x518hE}8bnD6IAE7xu zLpg+|>I%(1QoB%Z;o50PHNIbV`9*)Rsrr4J3V=d6iu`3v%B!rb0u+i>KdkH2NDt0t#$1$0TBn!ECxPj3bu2dy7$BEn`SOoj-O^^sm#`8#Q)ECE zt;u_2q3FP+qSBeInJ*h*Of6rFCZ+bN2s|>_X4@D$|A4u66z2nrkCYtMjd<*nuL&Yc zd=Q*#$6x}qv#%OKd?7nUIqexdM>6{cImf1ZxhA~){CW%Twwh^EzMju{r$EEIeU|xp zZWiH>_nk!e;i9u0vOCMg#>mJH#}+s8%pHOlO0!jPt?7|fkF!a#JH=^eUK^`@E!)*m zMWj8Dq-QMt!%uct*TN_Ki8qJev&&8s9ZX|yscsfv-b{a}Ek&j`R@bxO2SGKp!(p^? zh5;ny3MVhAw+exQU0Y6duSrdlEwWsBUK%Zu48SkpRjHY3)39=1`_27&bq0_;tu$OT6e;-w zeypj!X;4KY?_>}I5C;dk;gW;C>*AB;S{BvKF&)3&F#lLdAl78Wz>k9__SDUeejJGb zq|-!6i+xjwF1XAfTpDFBhD=#L~WMDNq9&Ea-^&>YOUv7A9 zXk}?d4j>6`kU&=1J2T4>a0RR+y%S7)M)aCy8-(v zp%d13&YGWbDfG72Y!jtMPUEWM=-k7RhdQa5Ggt!Kr}Z6^j;+uXH?W_p>W`RhhIP%Jd279zoOcLK2zHX|8p|H^605+MU5N^@!`FmHV#L z3O`Mks+G|!THsSjCUSTo`PU+>oChDjL@zyZzPWU@s(*GUdwFV1GFedC-N<2P_pp3+-4==4j zp1u&NI@D}NS?!wme*dW%sYy?(Lwkn^*^U6_@M71r;80DPV3(TrbySugA3FhS27WBT zN>CoHq|+6&;%H1W%kVkNswS#%GA+=pbbb-{0(qtcB^IPO;bReAwlH!3Qyso|K1=n> zf>_#mmu8*a@WA0GzzjhI8w`d%q3KZUGM<{xo{uQOc$&OwI)LMhJm#GhL!N$j^xL*( zaWSa8YSEe)hFAXC^Yi;?aM$hr!+c%L-&@(igiX3wXE#wCei$4!rVb>Ot3UGTfDR68 zdB`YSO)x%<&I^h=WEkc;ahXQ19m8`f7m|07#uoOy`6>-VJgwxT%RznuJ#&^6dZJGW z!24X+mO4!O{<@HmS^9R?uBBx}-E_Hot^=Ra=>95rbZy=$5BChC{GsEsZ(3bT|NFB8 zbsBEc1uxmFa2@C;xB(Rb97$*vLASPsa&`_vJOj{eHiPmR`&$g)_ZcXqmfaGj{i4dz z`JJJQg>S4tWLV@RqB?tbyMJ`{-3n>tmLg3FO-4rH58~V_li$uFAR7BMN?(EYR0iM- znpa@*dh(ep4s`SgfvKbn@FKsXm$rXXb?4TLT=KNHLwwGlRNY9FZeee0HLM4jjF+Gv z1{pw)_CUfl3an=_BObSxE(PK`;gaC&O}&97DrJDo)=JenfA$JJ*zdY2NZvb=mW$GhA{Suw^*xYj(L$(8cu{vTKl-Dar8Ld4<`el6l;GBQ#iH!4T))qs*6~Z> z+b5U40Y3=hSkav#ER3XQ`F*&{nQf4+PCL4Dmo6UpuC>}eWd#nyUZUbUBV-wX#qip) z6*;@x!l#`9jBDF9`A;7oT|)yX4pQIq;gg8HYm?(&n}&iDVy^OvQ&yHFs3$tC*d3iQ z!yS#0Ib=By%4=(!c3Pte+L-Jfg48Ghi2Uq)e5blRQW}HkcBo4$D(}sppQPziSCW_M z^8FR_+RlljywpA?blU{(bI_s+mm!l?8}VgjFEY^{#?kCo0C`2=bDHT+PV*_sF%q(^ zMa>RGP~Q!=IiCP8Q`AUf%ao<5&Tz=dD8@u?VlR33Q6O=Fl2Z5EE#e`)lQFbG4si`{vf8IPC(T4XK0p7zS|l zQFt3&e(W()1mn|#<80O;SxrR14|ox@KnrTfsv{Mdiu2|LXb32L1i{j6Q7=C`LTk&T z&Vdyw6P2PoT25~z`^h;EznDhzPJzb9rmi!q&DzrhXS)%iSQBdC**vmG4yiI8&lkLs z(t4m-M=7y`H>H01)JK9U?flN&X1(pBS_!p>)a|xp3n{UkTal@uv+C6`-(mcV`imBn zYtX`u=1Ti)JzyV)RWpF-W+Teofo|=wt{uC93t$?xa^nG2xECgN86((+uSie=5SXNWSf%~5B-wJ>*t7R+FA?o4` zJ_#;?LD~~2b1AJPS-vEr5Ij8!6KF$>yWMseI%eBd&pYvl&kREq+y z{KHLyicxUOK?X3B+1PWRjAkvL>qYYUXe)}z=0_d|yXi36wQ6cfG5OdcdgAOHnYFk+ zCe)dxNsWUmP?5{>O1Rp`#VG$+--*<+610A!zU3p^kdmD2z14xkBDP1NAVaD3B zh=T9~%|ua?c48v(AnlYVrG9oL0joBfF+MkgZV!jWP2i$y{K74URj}qHJvZ`~ms4M` 
z7e{=R+#9PXnvdZy48q|hp(eH0CdvboN$2RKGTSvl;iuG#C_HQc#UB+uaHV(5nJcw} zSabhFES0?*#|NG9j)8;u2dQ(3#7mMA6q)Rf*%`DTO$~IUI?j~jk3bH0UnXZ9xAd*(1BV|z`4)C1W0l^=_*$%Q5x<;f)bI%dQ zYs2^BM<0D_kI;Y)(+y_h2Tt?Wt!Idzp0Iq{xOap+=4eT9I^g8imaqGI$n{fNO`&Dx zh5D+G%V@dbyOfTx!3ihFFMYl;rg8~(m-jROS~1zVM^|2i?tOibanovGJgMaXabu(v z{DhfiObGXE(wVZ!a&4_#*dv1;6qPjHn-Q>E&1M3N(~=ET%a&qH-7g6_eaeSo zH|U1ZSj>ueS4VBa&P)8l}Q!m7=*^3aFiT9n>|W*_P?a%B=Na27qG zjoe2w8VTm+YR8MF-yxTu8^?r_(n?Zi{M>q~@cfv{ZRfY@<#I+qA2_>xzzKU9Pl9#e z_xWTYxmrDNF~ogX9A%}w(~y$bo906{bij2X#uv%FZ4!=WTPcLnY7)E@|^5=@131fE^f3ma}H z^0s-%V9C#KjmV63s?O_nNbJR> z5G28>i75sU8~i*R4;=$oE3p#732u~&)cuVbSbQ59bAF3k^hx>ZTmE} zBv~@PpywkdH7LyTVOn)NLG@0k(nE!iw~}#h7H?nw@%|Rv-F5E~=2_%~){I>lRm~Yn z7P^QDM1Cw53MDW|)(+w=I6oA@B;?2m)s*AGutWztC>fa&8}7_iNocKUJ~%A{4N9)f zYpUc&aXkOx2o`XwT*@oG=$^tO4lq6b0k;jTFb4P)ct#Bg?$-Jk6qRH(o53Q5NR(N7eOw+9MA)QOA1B z@qAw}{tH9oVRA{LPW`U+4aLP?>^Z%OD|$QsFL zYDTw%WVO$B=p>eD2d^tQn~_M+q2B+VJ@eTrtrdkApxMeFa^AHaI0oUGnT5%QHkekkmjdd`9NqlGi zQEF%`{n!M`VD>#xfUW_WIPS#P;N#R#M3JCJo}JkYU=M+HequA)eMDm~Tnq+Gez78j zbDP-JeR6NS%G28s0-fp!(ink-;a(UM;Rx356XKf6=h8L=TP=PZnxva9K^aShD?|>% z4qydIN^WF$+6&A_S^!qM=?oZ7A_4{|@>}frr)@#F)M;%64Q@lTqAyX{of+b|=SD&G zNUP^>SMV}5f6QS>8P2_c;0cEAn}ji;opkPGG|ZGf46BCje#p3SxeZl{RkE%em3cNh4LWtQQdQz2 z)iL(%FoCrexo*M8A&1hXIpJ8seysf`UHcc9vGzKGbS)T4_43XROH3K=(wa!PQ|=Rn z?s~3wg6_AMINv6M<3xB8C&k-wu;z$aq_9`3@EkR*G8%y@gG>Yd^j8dgG`CqM%O%3PdApM@>iFY%UNUe}wkHW-H9gXh(Cr!2Nj z^ryd`u{wa2qsS#8yz!#o^BH7^invc$NG%zefY=+X0ysE~gCSdb(p_}5p&AhlD+y>0 zYDEqyl~>UFx|f4}%F>IZ;qPWaO?>rNWNHnCfqB`b3|C)n`>!8>UiVIFNk#TIT?yZwsePp>Mg9`AM#R;pZ z)_pV-)zfkW&bNpZZoZJO1iDb{$tqE;D0A*(reJO9d`2bcM&3+q)_8)cpIPZdbA2(H zMe=}ETxs7 z=5`=^rcwyHetBea8rWD#!Lu>|U%k_jlaaiXiI?W!%2onQs(+0Gzx`)x-C$uwKCkcW%NtK@_Pq%~_rRo}rZXAi10M zs6$*ZPH)8i=HtrAWUGk^Xal;988*xy(TmBvdoW0&PtE7aBIS|dK*XtS7)^4gg0LU4 z93HrB=c6r`szN_ZLxMW9XijioH(A3vm02xsUjB zl*;I}9r!z6jr zHHyN*<_!HgWscmNt}8@pQQ5EoE2*3-J$Ny44OL>Z1Ih1z>D#vNotA7G8tWJ%937^e zFT!e{xTH6l&vbtqO_SSs3fHKN>WQWlk_oNPK?CU8ZY6%2WfY5;tz^X*6Wd#BBZx_n ztOFFsjg}@01Ud4KB7#@FU!}8ke}#IBUAU8Z4?(25X{5${An`m*EYIgQj|$mpX>mK^ z%R(!~hC8sg{~6Z24qVwjtP(@#M6{vPB#%?lD@(EGs;MhIeYgbZzc$ztY9l;J-5DK= zNKTAZw=;JOh_5(9{v@fIgAUf&!S!&zSeTX=Y|~5Zo@NCzW?LufHyo+W_-giEUXJ<5 zJ)nD`4;>B;7lGMrNXi|s@6rf#kFeTtC-Q@&%n%uzb%T7A*()?swu$Q5<>bLF-Yrkg z5bK;d2IDo8D9J1X`_2Ks=Q@yk2nbNR4Swu31=q$qq0&hZ!qYj`wl!$}pgE=YG<6^z zD@?ZHrX8TbpHOla$sg{|ssKNrq+Pjm#yo?)y%1yD<8XF-!C9TE`GgwY6Us5$k87{f z&M$N!bDY*g&d9alCPLd=>+P3eRQq0SSSks;Om0!zc&g?-L9cuazqKJLPaWvwT8W}! za9v?m>FbqzG`r!%Xk9t7PMS)mq#77HS(|19u$c9f$EiFHE#P z!F?8oP8UN*k224|CVfby%dAvpv!O$&=Y}GcXv{+db($pABn{3&JNa=P#zk|aws!P$ z>Yc-EcTPaJvsfdl<2eetOG%9`(3ioD>DCqYE5`77H?k-Eq#>X)L3m#LaP3uI6zERj zBKfyaR@)s$%L*24y|pV7YErl7U8uy2CD*$P-#^HEV3wA2t^C}*Z!Mi|_d)OD&&WD@ z=d?8wRWq8b!V9KutX1#imw>3*mAy1Y95c@7K&J7n? 
zdxfmM-}iZ+=QnNLmXR3yMPA>0!5z`{HxJCZMV~qR=w=56-|vE>Ac#*3HMP++cR_UL z-t=Mmt0|AQPx@&}0g>KbvoKL;L8ucWUOm6dx*S3O`Aw2H&i;6!(Ic>}#V}v1KgWSD z)o}j#3+QYDbrxT6cN2Pbrm?-cps$eGQg9XL6COwaT3>)a-J5V&w^q!FAX>2h9xK1a zfZq)OsgGfn#tTc(mb%Ficb$cEsBV21?Ab0hh_Rzz6Y24PKr`fP0AOu{s;QYHk4IVg zeoUK_k$DSYS&V2|^J(WJNOcPtu#HiijIfmmca{o&?z(D}OjL-tJDw3|%HZkm`&NKn z`dnTvHKLHNhO|ZVQZwu*Trs;1u}#?bF<^)U*<3_?T6G)ISU9l5!|WLnJDgJ?jdZXb zE+1&w_+DHe*umgiLg9Vwe7*UKmpXg46ktwCuFHNCt$uIIm(b1F67ed`5%%*Jp}qHm zAlh(^o4=8=F3j;6@~+@q1?y6)d)UgPv60>VDj_YJJT|l1LBh~@lQOY2Ut9J7$_B7i zZhAk_X4DbK`05n&nUY1ODX3|HYK#?RG+yR~!?m=K6S@to13`5KO|y8VY191^p=gcs z0e|~VnOLMPtP|vjO*lW%(ieDbq>Keu zv4iV?l0-z$(?%J-aE>7T9O#RZIS-&+B2g0@{cEFXK^X(*(2EqPTahOjns3>EQVBvE z@YK_YCfIR}x9$bSQ+rQ%_&yep*Tr4~uje{Rn4@jP+&JQLS4IdaJLTl$6eQ2~~uWLZu+jxc_g84bI zREsw(Psw)dINB_ur5<68cLZDpRMDJ3=qU@~*!nA-!)d&eFFSOF#V3QFSAQ@+xNz$a zqaFu4?Stwq;`3B1767>6GC>C^hhmA=B4=b1h4-L!1EO_+D{`YO*51V~!2}7%&P8Y? zx2Gv8a)tIV=SV>tUO~5uTDXdikxUDS=I=%JO$enjy6gl18vNySbVF~a-|S{^Vkax; zsSYdZ(Rf59&udGFcs<5)R3@ZlDQ1~K(S(pXrK>Pz39D_o%Oke|?vfDog?{kAVBM!! zwDV1{GXj=G{(Ep&wkv+c^tP$sbuaSuZpn$`lK_WwZ(*84zJr)K3;)aCc!@ACy&m=nzGYMjebX5p7PkEBr}X`@{A zRUz{3nFn-+vZc8WCR6P(nv$3;S_F^=X_C{Pr*KhfDMUefbA>w}M1{F#FZv{IP<1kB z>7qq&s5phP5^eJB(>b2NT+u=$5&vOgV=W-y=VF=T-HPy;i*hkR)r^k1r&DZLfFGbq zMq>qw!%!k`gQLK0$4U(8zJ<=7eg)M$YJNYJH97uT)g*aI*`&=SD5Me^G0 z-PJ!2#lElWUCR2{V#f08qFR?CCZBUfOW%x70|+7^Tp1!reavDUc={Nc zluXm<9gPMl&e8XyT8PE;RRz0HADE`ZcOWUs`W(PU()L~@uRTAfKV4?{_9 z!Qu?(mZl%ymqPRUFnA=}#m zMJsC2Sj4?`$Ha!mXsqH6IbO39azm%xM@7Ubx=n21(RKk`qg0qbL7YX`5z{}X!_2vo zv5pxXp>b#~Y&HJQeD4@jodj2aO2-@9qiGl8=2c6l`S#UyfDE~hhAVrGUtcDpt-&Fr`{^n9izb4K}QF9?25Y?@Cb%iP%nfR6#Ojb;)`<1d>0)=$2!D>Z#aN zuo2?q92__!57p<8_M?vOS)9{(q9Ku`>&&llVV;#uxXe`L|0V(~h zaJatOLm(9{8xzP|*)J((NM0I+@IGM7^77%Re1Zh;5J_;XgukB(AnUVWHY->)L?F2fd~Lvl{ap_3mLUP~GjuSq-B1^5syXupTqo}4qpFr*sb26 z=!NxZt5~zUD`-n`_P9lfl-1e?B3H2_IW2W2}*O4_Dj5jyB2qJV;Lkh7Kd- z0KjD@lLRs1+{OuRMJV7#i@JTdEp)u#)PQs8)6`lE2+-oxS;-&98~^PMD)^Jy3Lg4gf2b zlt%rR26)i_m5+#vbYoOop$U8@0CsL@=Wv5Zy1D`BOX~tf=S6W$kp;Rjfz%3g(2HZ~ zdXWy_pUwl;2(e>xsJCPxuGOBPi0h(eip+;a>mUox$`4XJK?bE&Z;94;K0a~K$H9PU1ZOqIJyxmC4j#s48$Rem8YKwxI$$3Q% zTA=xn+xKHj6egSO!#nZ`N#gU4ciBC*yiVp=Rww5%J`Fl7GYQ-`;7IunRsn(|TZ$L@ zUyM@{Q+$~QBa-eAs@>G-sgU&t5Ts>1lt*65J(3b6p?X8kNFojXDi<bOiSl@ye!l1fPkreVcCRlmxa-@2E=Q z-XV2(0Xn?aP0aBzwz8N6n7`wfw| z-w*z9I5=nufOCFe`8o}d&&itT%UFJ`4V>CHBdvnmqSfeb1)vZDZ1HWseWK&gUK*TI z{JfcX`;^$&$;3jycI7BW6PkeXz~Lz zpJs;r0Kek{VM^Gv6u$)mfMkdeso_PJz3zFyIxqD>&WUcx3y*}J?pgmh#GZ5HQU8VP z3IF-O1Lo#%6jq<`qKv7h`|~Pw7Sz_oj9~}!fiJl8L_Buh#IKcJ4U$rl!EB74-pJyU2e_ae#zb44E0VK|y$dl90T!(PXL74;9!cj!_GvawQ!HR&7cXt0CiUUo zGR8gSo)O{UQvkhvg&iBLjv-l(z0b#HcjTO@h*TcKJJ^Y4QCt1-KX9_e zhB|xfFiKEJtsJ@;9c7u2f2erRv$l>3jwt37vFzj|*b+D%(r zTb0t1@01$!PS1DSrst)lKlLYctnesyTK?BFS3NkdO=SkHFHj#sXtU-radI#(5!(!l zr0RQv9QnauYz=B(&iZE{5cLQ2AqP*{7%_GbOmNgdxXNKI%8iGWM<3s|-yAH=3I!XQ z_IQ|dp?1@Kqvw8_E9}3Ewm;L}uEEa8cxls0FAjZ}Ag>ZRBD9(6n)l+`krc?T2HOTd zsZ|sR(*KG0cmk-*ym;wCrrh7?0D9`)?o*0?>z&tDe~h>t(3mijXh&ShiKHdd4akQz zekfh(K2v}0-t!IU$%$=4Tk1HY;T?@}UA>mjk7(Hg)JN8vCR+X6H0P%nl^N@yX5WZt z-D7CK?qyz^$n@nr-D2thY4T;Xo6L0Nh_;Nf9CK~b@^u@c_)8%7GcvptWP%hG!y`&T z+dJ(Y_G+FXN|%dUc|<4@<;h4!kvhx%YA8aw9J8167Eg<8hM=SJr(4JlMQ$U)ZSw9M z+dlGPtC=4ib_4KasTJ(G3pVA;`8gb%Tf%yk{A@liYmRNoTA+>|@z|}Z7PF2_w&x`h128xAk*wF8 z-7mGLSVquI27pFGpfmMB@{Xa^B_G6#RQ1G6Dt=Bd`-!xCifYD7sk;xw$_r`jS@+jJ zl2sEra}w^;(aXJoO-)Svlqr)&$pJ8(C= zKPdgso}|A|DA(S+xr?|ATA?k$@S5l-$`;f|K{sYyhlm1}sZ^1dG|S$^3YX)vu{B7g zd^vrEkOnV|cRES7eQ5kEf@YKrnKk~(wnh>%r+)rm+nZ8fc5X0y$hCQ~{(EMchpKBo z#U5R6Zy^^;a(sNUBpx0BS>kp?$NeA@J_^~8THo`o(5P+ph@IV?d*f?|uSdB*Se)Hg 
z{da5JfA9rD_nOzNN%j3<^6^q^J2RbXrL)Fcu{L@A@=@`*(>_kt(B^#U%w9S~foH}N=xe<(a!gI(hLuwunrqN(ef zP5wUL6B@=i9ief@gbh3m^WY{p@+0IjaJuo#h8!CQoiu9upEje559-bE8HiMigPef5 z?L4rZ{SCDD{|le6A@=Lw@R^>6Q~YHi>PN$_QiHIm+YZ~|%h^Rlv;x31|6z5>IqCC! z?k~_$IId#{OuBYq4frSK4;f?OZG&h=6>B)uqIGQVIO{_9QDk3kzGi&tF460Q$Vi*z zOxcPK9PWqwt64|e>9zB!OiqM=h+1?us=5-kx3aC!Xc<4Fv7&=$8;m;opa; z8Hybbr75|cw52RMDCo%)M|5B};xp^}09#kk;h^^zZb+R2zhN6IN~P>ZZKneV!Q5`} zNoq0;qqG2#t^J=ydr57)QE3a&xL&o4AvVL#RAXM+67hM#5^xR^#z^g|tVUjYJbjcY zL#M}8%mO*e>F)Jcu+(QRCFyjsX3l6;>T^H>Fb zw5T7S5nS`EvWex{M#tCL$0%|W~h%BFpVVTiOX`LYL3+{9iZL~`rTZfiZv6wA%eOICZe>uBXw+20qTvBw@6eqx( z0CeY-3Y5QhAhwq)S~5aDT#2n4W*Y{*3B`7g%v-o*l%oNi2-|R?FSKq`->_1 zFmdLb`>Yk>ClQ-KzU7fb!Z$FreL7xX$_i9+Iyj~*lDtryP2LE1N|e+rk#W@U#PUb5 zdn(%z0CC>|c_I^ws{+zmtC0+BnX@jZ`eX=tTGbKUJY1*uH*^eU&MdnRG0~AyEOZ3( z&}{|GS3AcH3xeqx6j$uWS~R5G?+UChp;)00kQKvn$c8CnP;(g?SMdE?bdHTLh2Bo> z{&w=&8Ol!H-ht8)0p@opZZ)E)h{sx!@HR=;4$37L7)n4kK;@P!!tdz>_7Al*^?{$-9xJ z5Y)?gQo@C2fsh>Vqaz|y2Hhe?px0fEf-{A^0e&l-aClkIv3VW2@TU>I7>!f=K7&a! zOwe$lCvAH^OA<((A_BTy;{}m`RF8l}bpdbyp~`^ZRd`$`Hb$rAodQ`jsXcfSz+|k3 z7T=AJkfOY@3G^zMr^AT-WTru1|EDG=+$YYcq?Kp{B+ncrVAB&Fyc*GKR0%JGZSx2f z-(^)Qd$V<0qKfp;>x$3=-Cc2)7ji2V#3goo)Q=npHipNvXuC)_>=ycqPIuzJBXc3g zAI1Nu_gVBzz$)0=7n^e?v>raly%*^bR{UVvl+7SLwfcpTR#}2 z*6<@ON8$*zjou7P61nDn_AaAI>Wq$G+s5uKazo^6SIuqQ zzXk0&Yrv|+2ch_V001;Pf<9ZHtn8t8ZdM91*=|FC=x*|-E6?XCo}hjV1q?H%}=xGfmOc{shV8d}2yXNq-p#oi^%Fjgb@wZz9T$vG6Pr9*TvXQZai)GlBjTO?FC)LxFouF1Y5(GYtBWZ&ls> z6@tJ&=CzI8zPY|Iwkon=7ttM{mHwLLQG6>5Ujn%d3(qLtFchR2sXW1Dq;-MW-aED( zXmY_}MZuVT!dRvM4raRh2z8ko-SGVmy50FfayVkqex|)>9sHycG3xw*>|ZlJG5uuB zyG_pyqiB;Zh|H#m_)_#*0)VC`)@ttwboQ);E`Rh5!dFOhbl+luK&S8pcK~)MgX7x8 za3;1BA@HzRIgQI%9%FB}XC#KR?E8Edo;<-pt!iJ8bZ`_+Z+_5!FU65s8T>J-(tjjP z2-qeo9Z+tf35ea^n{Cn+ly9&I{(dmYNkb5JhI@O0r|e)V&_V?hlWVXgPCUq61}2IL z)KJxaH7xZ?F@R~m;KmkQe#<|2XM@q3F#~C?Vd_2OPo$-FXZ8<3PdHXlf6nmFa2Xz? 
zl(olBKizWAXQ{8=E0cD8*&81j9 zS{1fA(vU$}yBpYXpFEcjX!GE_BS=>^w)a64XW2|~^B>6Bg&My6PF21A<^%AHjPk?; z3%FZ(@Ky;#h68RoCF-`(pH~3zt@g&}{ouTuJ?huU+EScTF)(lEx5SxQ_1|;O27@ce zM>_+)U+V_iDR^8iZKHAd^%J!SV>i&1npaL>-e?!%F)0@n!bjV!>6Y`+@k$yuCLItI zR68_@WJZfq(9$OCng=QnX|iy`gyDe7HQ~}B?qPnwQ?6f7EDryYW(h^~&lj$#dnzo%UPzW(H=z&k_1f#1KFv;4wRn|Iw95_I>M`00QJbP7H8JkMYzo#jHb0UmUR^Ub_!R4=&R!fh0y}c zd&n&ssA>W+7ikVCJSyF{p$v-YNQ^`-9u1fM{tNKpNNWWtF|X$4f2(nQbE(t1m&9|4 z^{jTluNZ^EP=SDK-RF1ry?Nr#PpFTnSGG5Lz0~q>I8r;=ETgvVhF9$AM>S(v(Ew7bkHk}`s6hk%o;P((!DMCNDam-Q4 zOtgYy%Xk@KPDkT)7SZ9p8bX`ckdZfXDa1cP-u)w<^iup|;e{5K?O)7H>+ZW#m&J>n z*{)*%<<_UH7S$u+I-Aexdr07Hyqsi>GCM>Y(IG`o{5~`T(N|D&gn^{a-j!(nM+E^} z2Q%`5rQAA|@zf#7YFx2Q`f7WZ7+;1mlz3Atww{v?L_@h5@ELhUOG13G_IIK0McErW zd3Y;P4-94|uKoZVNUPGhyrKhOKEDg}0a3h_J02-`rs&NUkCPs~T(V?%M>r4r<+GGTBDfOL&c)d-@bm01MGl+3Ay4xoUkrg@{p z2ToboW~4;HodM?UZUSWu$Lu6a;lKG9O25nEc#Dv*R2X9&^&bpJ3@cr(CnzK9oE=^a z@4{E=3|U4eQ15d!Q=)Cd^Tc!q3fimrewwq294BCf$cyS_K(`&upqj>j-w?S#N`l<( zqc=PA%gTABD6K_j`$YPHRT(lE8@k7yelkBkra5HUR(y>ML=X3>Z{Hr_up`sv?(L0d zvvv(Tugl%(cs4F`4!4EH4}Kt9tLSncs1U z^-l+6<#wwf)fscYQr>#G#e(%&{cSyHbhBitSp!9H4sn&MDo3sZo5aAyfJq3CMsM#`O?0+L$^Vaw}=$u@G@B31$xScT?)(tm)&&FG&|i0vVt5uvvt zo@b8j90c4PQ4t7Q%_2vXs(eG4ZUb-8h^Y3Hk7PjJ3Zg9phN%wX5|nomp1ejr1w7Z31%+6u9R;fv_p#U+3Zb7 z>|Mlp?iyloY*4fol`5Vz6~Dh4o?EZ72SE>884H4}>QAGwH-WaolEn)GCvgh2Wx{z7 z3aN}3w(YnJ$gLNg$MhuX*(;QtQ*3=i+=aJ88H&yl_@@)BOI-_BhQ)FdD*kjse@z9& zv7!En5OH{>=pLVLrUwnrv+@E5nPow-euW{&i#`SjaT>Z}?#^B83`#?&a%*uZwCj(T zqD>GPINoyr9sL8`Gc^?uOv@#^1AFq3rhzfmx>RO$)x2+cU1Z6PH6HuU`uy+!hf3tc znNft$>S5F|&auS2fxU7bHrwFwplrq`wE`OG{*W6OlWK$WWoP25f~V|RU~n89%pVB{(3%2%bcZCP>Z zv?gsyEHgFC5ZH1!cx=;CQD2?CAu*M>oNQDMve#~R^s zWQCU(7K>|#uU$lF9qM3OtLmC)6}m=2OxG<1jIS6p1MZ!8Qxpv4xfE;sG8R{%&Op9L zuTbnp{ghX*i*f;;Bysj@XD=W6gvWQ4v+7=tx(IRPu68NBAHd2L+zYN?ya_|IEGlv- z+osqiO(-9}4Y-TLm^&=(>`mn_VH13r@I2W@(VIH(o?{_e^&IB3>739|Wm8D~C9z|Wy^JLWx5P|jh3$BB zd*BE>lE}*pQLvljn|mm;)su)*djeZ3@8x1N#J#ijgXo)~`4B1|4R0&5=)+N7zl6># zzG|Ii18!vYs;<%xe+B`jqH)LgcY(ir$6H>VU1_?~%qJN4uaCM2#fKxGWuIpc!;@_e zj=&hrv(^nrjUZnz(-*BlLd6}unV`yqr7Bd$uUYN^L!hS%0J1GRn3jg^FR~e`xiCA2 zxe?R&M~2VV4v`PIjbCw@73`u^qHbSsYx(jKZN`|;ZPOy~S z0c%qZ?Re=LtuGT~&BUk--80y$(Y#K!-6#Se!}M4$yB_4o$@lqzYGxaM=?Brq_@FJH z-jl;01dX43h2Cg;_ZpC*>BN6FNjW2P;P9-_x+HpffE3cZ_+IL`v{heoc8Xmvl(ZJ-6qoFwVX;z>*T3d{5MC=q!k_3*Np2;TeMpW%Z&f43Qjac}jL2$_U zXb!!C!nGx$Z`ggndp7$MM5>(9t!_w`kIZCMfPpfVn+Xg|x>X1vjlBjsPz$^%(KSMM zxp6(vPFfR3Y086npfru!Zsv^obr96EaIJa*?oefsZIras3F{_p?SHiuA~%)~P(nA! zdB#zVdF>qiu*gUa3EJ|qo}Lz53bJqNd%0@L;*b=$K@#$Y*|u~iZX#Ke=bm3#?#5d@ghmqFo z<#1eEe0=D6l6{nV>-(_H$nfJmPd`~_GhRWnaojISCPeVtUpbD z?%ho#-eO8xTUb~z6%2sX0SHjL+Hdn8qvbHN5X2wCH zMe@QeaMr#>2y)&vVBIo;_Gur+_=U!GJP|EHD21RKwZ-Dmb%?{rF|?-=OcliRQ|Qr$ zQdY8e9b$Bij1Pa9nBc1kWIqQlmHC33Nzj0UXpmz4MmHqd0JJaEP;@(}{P#~9eWcXc{A|4313H@%)@!!Yqz`fPk zvj5`OQxiE6w53nxQkYYW#UX1F?FFz<{(c6R9S98D`3yeQx%1|e2bEr`~u%K6o zBbqpip)JF%L`zgDq1r&PrMiKIifqW{6cd1tbaU|;x-q5?$IYBdsEPFWG~vk&9-z3f zIERqVKk^#w#b8i0i5!dNwUb69A?ED1t=4)wE%QDC6q$P0Sn+d=R|V0$-jIg1oJROi2f z=1^Y}3YuV-vc;<%q?%Yj`|vw}0#Ol|EHsceAf3$s>=CR8g*6HVydij+T)b=uphvG+sU^!}|BVAk5Qdi*Q! 
z44uk#e}{R(!x{IX{mJTrm)NqqtSp(ADw9IQwhwWlt7%IAhM8hH9uP)ISP3i+0aafm zE3CPjX> zuN59;Q=wqnnq(l3%i$Q*33!Kk6q`-r6~_S-KWlXb)&vPr-orbXZ!s(Ghg#f%g|c_X zO~YnYe1!!G?L?a?0A-E3caTOI-tg^8K=aqJHXo~h3iD?zfPKTNp_Q5HUDHYQuaqz(We={7|W^yT4+ zF&u;U`NTw>CCJP3LyyYlu3a&bG*$K*N3k~#$HZ4N72F)R%4LyQy51$0NNH1~3(1~(1Vsce@~^mZsg zeN2R21K9Q$JM{syUVEFY4=azooPWj5smCu)xT$`I0t<_I#KQX83fhB6;Ly%b<#rIW zGUSs;ScBDlqZbnv+6-|t)fzHG{S)qC?zP(A1=u>xkJwINzd5B@hrSq>7jlzkp!3uS zK@!Dd7}Sv>CeVNtA?}hMPZ_sOkQPkJjG-;Vwj%o;L;gUVNdw6T?a_X1euz`2gF(T5 z+>~gAu2b4@HeUszdbITnQrcK}JIxYssR|%f$#4Lja`hCwE?WdrA(TYBh^y2#5)m>R zAsc9RMR1m1pcqXr&?N!_r|li`1A7x1{ji&qLRq#xuPm~GvOY!MPJK#yLwtdnLM2kq z_QZ9G7SYXsl5|w8X2MPx!lSh^PCCG}SD|OXxfnYFhTTxq>h1)}JvyxW4)s>@({&r^ z?;D4c719)k9n4gs*%mi}Stx5f(AI}r(*H_78%%7;ngfupHdki_k@~;lj9KEPy3LV% z8_oqH7wj#k4riJ=8tnl+lw1nrN+2D^;al`NaQu=KMj04YvmgCyx^+z4+N^)PuTHQC zI{+UPD|>Uo@JLQYPMwzpNU=DhG{Luz>>^%Zzu+$wZF(<#f}iz99D#i5u}>z&0KJM& zqqv}S88h2ho1lje>!fTOP=b zQKeEAKOXtCS!H$-6?BdjUdL%8;7KfNjG(<%C)R$Glt{G{p91qsUDwRNNPL%v`k7`Y zdSh%2Aw_2hj{Hz@j%a1jUX2f{`vmw(Bt+qqE{@)|R^*}^MDSeU8hE_x>Z&+N$L*_} z4lVuAaJ`s2=ANqBf*Dox{Gzwu26w61sFUDpbVx%Lpz71gDx)TB`c(7Id!`R$h7^pC z7i>89wDIYGet+)TzunidC@8~K@81$wbp%#rfY)b-ImRM$s$xLV?*au3eQzwG&xD z#@ag=Ri2>l(;ylkA4ZfdbkkFCXhnPTdL|GqeQaOB+OF`)a+?MIn#zviSKXi}8nJF0 z$9S!d2z&(Wa#!7Fy_KjC*wfxOI8d~ODi_EK?p4FXU8&e%xLdL#N3(;$dF<1LH-;Qh zX90A~gOn|FubOJ)%#!3@@rThH3vn4Lh{Q~YHK-Gtx`5b;>KTi~t6IUV^Ig_}oS!6& zzlb=3Qq~4GjfplD+OF|o_fzeCl7x;*(h}6Ki$nOSZXD0`xXdjNrCm8I9UYliiFPUM zy(QLyxCBZuLuO)LU2i;JJ}{5N-zJkZXR)BBLCR7SzYe+{^X$-nQM>zSeCsxro%~iW zXl4W^Se*7L#?(+=e^{Tl{Pwk0CoqOxQq!)H1OiXag?MWWsw{ap*6)~ zpujs@|96-)`v|DaZQ~L@HFK1tnYXeWJ4Lg2W?sQd=Qx~2kAvxLr^Xh&sTh%n$BjC5 z-vV`8(3vXqcUCZf7LT^`YsC^8F6a$(xUmi?=exJR@zJn{Lm4OF?=QpF*si%n*69d)Xwk4@4~h0#>-h;{{j#%n@;ZeYH8!5UBje?Y_>l?r7}~0NRJ0d}`}q?(FVL z)=f1BxqYV;^5m;8Jw^?_7A&JIkR5EHkal=S)}VZe$QUh5P&OZ80gLl`xNMkP9&vRf zZ60s@;ifos{;6<}-gDhc&J%G5HH~OVjW8zL0pu{0i6iDv>_A?IdtzNqTo~M&Lb2C% z=*)qx!(DScs!>;X{}tNZ?w~(NS2Oaem4MRp(e!2Z5u2Vg1h8NY4jb!Q0lZBAYz1MYHyEl!8dJX~iCl!~|!BDwmF5t*r#Qb}3i19O2Ept-@@zcXgIn91=J>TMVm96auqpKBzGA zM2~J*#{zgKx%<< zJc8oPTBK16U=WRC0WxcaR8xwTu7_^LHpXaZ6H?oDs}J;RN3ss4B`_(N)=UJ01%528F8jb zvjrS!__QjJ&ww(Hy%dHcv@WUz+a5X&dUA#hIrhODe;MESc`=Ko$bE1jLssq7=71AD zsvDZO3_VN^f%HCBq*5>7Z5c&QXmn`^XqxbAZc5l|L8wP{NZZ6Fpd2hXJ`C=)$`bO? 
zJ|PEb+@SLXvdIh`aq9M%p+HW&CdJ2uZtoH~jEKxXG#vG%Jg}NCq~g$KA< z62$JC@X}U%;$XAq9?BEgzx@umi)h5Wzi5^d0Ic zdHS-Rnss{OpWY4+YC7g$dLoo{`X7l;e+!w{N`w#bvp=gjrb7Dt2;vfQq`()}$g$7V z+Hfx0I?RFt042(stI5N3J~EyF0A@8 zItu_O(rgGo-8t=aV|2YnvfvH|;M_2Zs1BAUvsZN#LmO&gRR?}4%2Z!Yl5~hxA|vU- zc$Y`WO*t*SUTlr_wh`?xI~mQEb*WPXGQ7y{<*oNV1O}z<)f@+H zBVWQ2ZX~eu`ooAH$ap6KX(+Z6%A9?yhc?WY&ZC3Xhz+D0&n)-qlj06 zGJOkxYhX)@SEW2pLbROX;K^3^$AFOc!M%_$G~DM|^8Nv5z>O4^?{&`Ja(jm}kY5jy zdaD2UW`oJ>{F~=F$9gVqew%J2(b=FMmCID)E7`sIlUcfT=y}cx)}+FxtryRD%;#Rj zet61BBjT$m>%O`xu10sYWQDHL5m*}E=wFC>Z$1Fu(1q0Dyj2Zm2S%wXsPDSw;#ey_{Aqs%z{E7!~4ot?=3 zO0s565kGmNV%-AV`vtfC7|2s6-RHbi0i;JWEumsPU)`<7qGAxREzbIaQwac(_|a{b z%@|iQ9ZjG4N`mWZD-&+eJq9#K55nh5)V{e?4nr+nLQ`K69oNoyuKhRgo$QdI!RJ=` zlBi%&jzB7G@KAB<)kr{Am2WY+GU{qv27muZjJ}}tYLQt;GB^rOqx|;Z)6(v>c}u`V z6p>p#7x1IW1B%@%|MiuwV=pSUjD5QNi1E0)6#d3G_Rp%*5nJvJmvW4^PxX!N=`kt2 zntXCy-4A^&=1Y#~WtC!9@a<&xjLE*R&`mVk&{tB{Q{YxGY{+WQyLz5?O0M;YKK+Kg zdbLPnm{ArvcCPf&_kFt~BW-&oZFRX<&fZzGV*kvp zogkZS+j~Bo6nT}F9^35hpWLx+JScJ@wj$-zxXDh_WW~lyr_NNMr-%E;|1bOX(ioMS2dbj>efFnq_HXu} zUF(@BAgJ(p6`@4qE9!0UA`f;4eRHyVW9Qjlb`cJuvFA!!?Q5=APc`~ixgV8!Gs+^_ zO9rZoMNJmYL!ScFVdhC2R2Ob5h3Bug_Z#AjBn4{{;vVEMkJ}bvCJfF4 z&NqE!Mmxqz&=Jv;Xknxu6)YI7;^iQGD6(NPI#-|o?p20Im zcgvNA*JUgkSN^m?v##9(WY`+$Ix`h7m-#9O*niy&{(>v{vl{<=*Q(wuGTtypUL6+d z89MB2SPZiKipuQEcqvsDM}|nT+I23mp}Ril2=zBUAN{S0>8$PUzT_#vkFC9Bwg#ud zr!s%DV%o8u-MY5C3flTt3+$~gzhUf{WV+ivLf>{kUX*w0Li83lKaJg<3rp%A_X(Fp zM)|*K3=VzN&;Ua}-vsX2%MEACFGQE*PFz~M|2}TZC`4@-*y8-DaC^J$a6#$ljK<+s zMWk-|qk%2hGtmHX)Kx=Y(h(m07DTp``uulKfeWs18>x4fm5Ye*{&Qc{%djaz+#jr@ z{pUwyO)oK{OQy|kbr-#(cRaJyncna^&0zhQv~t~Gz`OmYlU6Fv?Z19(kqlnEEP?qz zkRDq$IJ>2Ez}an01-dnANB-Hzd$%-3SEDe0Y_{&hx}+s%AElkGzp^G{?3=#gLreK^ zl`HRY$O%rwL_$*cQQM(Ht{OmClJM$fcS!qc8R4L+#f?_aU%cWvl6)*}t%z{c?~RQOUA)CVh{D4sp}nDtFdzwm*7!D6GG`&6-&`k+W05s19IIm+kh_4zw;mvZ!-sh39u--Fx^aK!^+^+6gnj)k zxy3cF_gu4<_($&TP^L#ew(-Y-C>f0(%l&(xoSzMM=VG3U9Hzouf+u2!;M>I);}K0K zDDC)UTPo%lAho8fCE^&3w~OVflp?EvVb1apv{TvI=wH+Hu&gaOB=YK?p$AtcQg^%c zcq=>0M_vjHVIXrR7ZoKx} zZR;ZR$EgbW0&Dxn5o#L4!r*-|F2ssts@ElK1Ar2;xF2M_y(fWj1^0jU z^0#C^z@bczL-vzA<2g{Jswus13mHs4SSiYyl235>WDO@3NBo~&ZM06wLxG)3Zr7+e z1oN(|XP3p@h!udB&a3adJ@a#?)II!;wEIoNyAE8e_<#Bh|Ns7V9TjHC+7Clh7^z z_Zot%C$8A;Njtdg&$|vPupEwm!D*K`Z2f{0k=?o<1ECnxD zsRLc@aSlKzeX^-4o(bq+{lIqp-*wSS-8YEW$M?1hq5=A3>jI^`+}A=<;AnF0I~NUk zpX$-2XHP13=(XK!#a+JkKiYfpcqrHJ-)UsBHIgk8W2=-hNJ>P=c4diD_QGTxV;M%m zP_~q8MJ3tyE!l1eF-Ry|wid?HawRiS4aWFA-0%I~^y|Ohz4!Cm#>Xu0%sKD#p7*TJ z^PF?WFHUw>oD|`p5kd$lO2;DaZ66l3Jam*WI({IqIJ)EEsr%Yf2yq}+lsyf=Kayxg ze`^Cl@^U^1=e=eEcEil1|y#yQF{zlsAR&HxPw}3IT0V9`jkeg!Rilzw02CC z%0YovQZV=H4>kgHxrr`RYx(qH_eU2d;xcd55pg)h=pyHvK82_TT*bNfp!l4_$#{@ik#rIL7I>BWr`h zZ0R}y+O@Y9?s9cz?uzb2n8?Eh!CLHx)Onu!D^OzxiKzrS_mqGnWp!yJgR^=Cnz#Xx z{%haETIowBeOc5;BRkXWKKdY$v4uo;y|&1ToLwT0XW3L6??4~_Zd$j{cIM_(NBDfP z)+U!b)#O3eFXQ#f?m~#$?Ypue<>rx`J2kA~s1s!@PK#$nU%#B(0$Q>%=kks?!R>#% z?+d0!=>}A@=`q#haSX5D>Bb=83-#Mu@~0cPcT!_6WxtrW8^NRoj)PseT9oDUk4uXh zF65ye*)^L~&1KHH&$!!*M4_i{%aVS8QJ;+4%QoUYcB=Nx zv3&<^sRQNdMib9@7`~veMc9Rl_61ysU+#_S_o@o3<8$B8OU-GL{yxQFDQR^yOgJb4 zRA){lQbZEcRP(bKvUFmui(>T%XI0Jh6i^Iy*t&7H#5u3m@|AjSR(DeLYc@TY)W^`9 z_OVFd$A_vlsm)9KfpVfOcYwnwu@*~{LP_t;$~_q#r0fGzGoLSEmFf{0#PkesHVHGJn;WlrUTArcFJzezH1X6Ifbu zOIeGo0m|^iUY+JQVzF(a{6mB}iRiCY+Vk42Q!K!9`G-fmofM-YP4QFNPuwdaL}i$8 zEVHml^i)iGNYYr7P{AE;n}xTO0jl@YL7LjK8r^8d67yUb}n;B1SMID0&LQTSl8jM2LYB21MT;!od0S<)P*KQFi)r zlJR9hyosd%@)KBb@{ghvNd-yekC~JdU0doIO>4h%c-V25FJ5I-NxK^CWzd?(%_22fTGu+_EgT* z4A(cR)Z*t?f_AzvxqERw);3R{z5Q*v?T+W)5z`S9nR zeUhvmW9V#<;;hViGn3@nM$@0zS(5l|k+qj!KX0jCM} zb*{?R6ViKoS`w3Y1T0PRtx8rbaNOD^FpS-mNamhZ!4dY&w2Vyx#X3 
z8cDxoK;xHe+x6)y-s)iD=bavJgAX_SQ{&vM-=@kFKH5w)MRNPqXjG5zd`uz}d&TAs zEX%9hw9}p%eS|8gDZG9(ZFBWML0veStcnaj0=G(F@}LEkVs!dP-R?;?5O7tdRf8sE z`Mc+CP#a_YyFs*q(A*i8@gzWVVk0K&!}CfNy`a3>A_^`q{eHpzuoI1f8&zE^XS*Ou(40WwXHCIj|W zG{wq{wi2*mj;HVIrR1hQKhx$}Z1!{z#@hQtzY03@<%orjVCt#|q5LT9~#^9CRf4 zW4#J+r)EP*)Efh&h4M7`|J(zcjmTf$C<@eb%~h6iD5LqcMdj#@X4fP~sWJ4>h#Lp5 zV&z+=uJE=-b0j&C6!s{1fsU70+Ph^$0I-yyIbd5uR|^P9hbbAl| z#h-zon7vUFK@T3U84VMX%lz%UR!q}klWseR>I;kz5w0j)kP!x`-7SED|d2{=Ku>21^Nn;}?%_G`%KfRh7?Uk5yQ z_h()u*GRw2@(k-QGFKUjM7%AHQh_fCwJga7@PK;l1_G@?JE(l`!e#@+med&7K~-r6 zYcInxpDsCl2N?zG>Ni1a#9uTwfUyko7M2Jp1RCPAP>wL5eBymz0!a!Hk8um z!1j25;hp$k=AmwdG9SwHiF_Nd)$!(woKzfZSDQ!k=4f#ps9gG?@lG^pMQ;*kFpMi8(A0auy2H8WfA6JCIb=K@LqGEfr_WtGHq7v>fdw(Au*BSZIje| zS`Txn@*?5j;uiV#0y;L9Z*actl=!~$I-NxsJH$4bc|6l#JqsHoHK~td7P-HaGEj67 z-Y0<=>hC%Niy7Z{k&H}pjbb7OB+Pog9B@XNQ6HXpQIyO1ajOM!PgqQZ!tEQN z8MqCN{k_GW(vZsAd?2Pb17)9O`1;M8;CX>KD^|2DXTgw~{N=c#LZj4JzpKp9Bd(y^ zBNHrpPfbB*14Q3Zkj;!akWOQEWiWLiT=OV8I@Z}a19Z#p*&!+TRr=)>NWA#na;OT}rW368-s z$b;H0Gd#78kB)@cr%N1MEDu^?;;L8fYR_HQk~ifTOlIXYnY@vYr+k^BxgfL^jM*i& zR(Tu?dDKwnA?{L;m=&h(Q(tK|9&8G_tm;;JkI8Q=#(pU7+^883JCWPtp8U4eP;Be1 zCmhaSUSBeoYS=fF(zv-U;bY38vf~ToCF{G5%R&TK>V4L@Hnalrz~#4etIiO;R1+W8 z%(jQD5}$4bAz>h+4{Htw(BiI@jS#f50*BUkfID- z%lU3QSbUYEL5_WTm1Ja5+{eL~syOoS=8e7*3nKSaO&`<@R~rndYX}60y)CzL7{f`W zjPi|Rmpr~`I_S10<*AhKQDV5@O~&nU$1xRvjGLpt{U5p=yfGA^XDCc4aXfSZ&Z z#iNqb-Nj9M@nVncxsK*ZqFO1&hL8G2 ztfyPfj*a&YA+Cw;wGww1J`k){SgAL^jn*jp`TC*cdwA6;<+#ngk^8mKRgbwz!nN!; z1!iMYzP}e!*mhj;)wW9$IhY6+(`1$%w$DzeA^RmTN(Ta;6etEErh4W|yBlH$X$QOK zrp=lA-btr+-cV8VTSokhlV?83+&c~U)O7;{ZTX&)?eSjN|7Ybt#Wx_{+V!%mVAw2T z2^oLUpaPwcmKfNbY$({-Q&t;Whig{&CBAE6Rsc7>7gc-wS$nfmgk)i9!Mv(E%Dj*_ z45lt3Wy6vGb+`ZSfuL(D_P2S8iukhf;FryYznp49J$2Q^Qet$V%80LNZjFy^!?+q9UjQ}=p zdyEIhWH-@b1}~L#rXpFf<3(9BLW3W8iBI_%QrACWBy{XwR;eqa?0#j41$CiQBMRzI zE}LyPPTXYpkoCN43hwnGrBU7NG=1u^_d0uTmpy!WahLilpW&TU?{*~TFSAvbSyp@x zeKwwqL{J_&&7YYpa&!EKY!Sk4Vbe_AyDTM>5gUI?CJ`SPeDzyk|AdxL6irR;bNj1Z z2ekxE<#VqNhx)NPXnyLLjSB>t^LM=p+prIqls^oON@c8*C*a59K<{q-lluR9XQ_#I zCt%$PwkI!mV7;wZ`O{c`J%Yf_llQm)g4ckdIo2U40H?I|!n%25WLBSl1l&rWxxWAa zbimJq)&ag}T{C#U!xHgcu6B65>*~3Gibtz`LG=JrQW>8L#tF#v3l9fAbcftmtL0Ce@d-cDF7!5i3a z@FM>+^2s72UAF*D@kaPZ`U~=`VIeqSaahdSv)s=RtSrX(-2%f%{ZGTN*!I(5e1`>q ztUbgaH0qYX-3SO;{TEnwF;XHB^lx)ZaPq;~xu0`(lX3CDI{rwJADxyz@hoKjPRt)& zm>)5$xAT9(Lm+$hyJ7sx>i-D6UblV^-JAa%^dFkmSbrm6Gzj7Ze;j~$&K9kH`VTX7 B7yJMK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test1.csv b/pandas/io/tests/test1.csv new file mode 100644 index 00000000..8774b3bd --- /dev/null +++ b/pandas/io/tests/test1.csv @@ -0,0 +1,8 @@ +index,A,B,C,D +2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169 +2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967 +2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952 +2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227 +2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917 +2000-01-10 00:00:00,0.836648671666,0.246461918642,0.588542635376,1.0627820613 +2000-01-11 00:00:00,-0.157160753327,1.34030689438,1.19577795622,-1.09700699751 diff --git a/pandas/io/tests/test2.csv b/pandas/io/tests/test2.csv new file mode 100644 index 00000000..6f914115 --- /dev/null +++ b/pandas/io/tests/test2.csv @@ -0,0 +1,6 @@ +A,B,C,D,E +2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169,foo +2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967,bar +2000-01-05 
00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952,baz +2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227,qux +2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917,foo2 diff --git a/pandas/io/tests/test2.xls b/pandas/io/tests/test2.xls new file mode 100644 index 0000000000000000000000000000000000000000..dadeb7c2453afa8ce6be9c064fa210647a316845 GIT binary patch literal 5632 zcmeHLO>9(E6h8OOOnK9mcBZAMV9QtnEwmaN7h(){_^C<)LQ0K|F_Gyoh00(_852z; z4k|lIG%VbZn8t35#-EtLLKCMu7a9!-3tbg9hG1kc5<^1!{J!(vv(L#)nK2CpGiQ44 z&w1y*bMCq4=f3{G zU3rFK9AX5>GhYEDE$a1m(U$6})?k5QnZz$A2^o@?(ND>1IPvOXH9RILu9}s>d{GVm z?Re_)7cFo*Tk(0&{>`x{u_Z!z-Hh|U<mRp>|^S!z!(z$PCTUrhm}ag7}Cu6`9$;!JXfhlLnQ* zE|?_Rq;y@U4N(hh)C3#3-u@Hi;FfIU%X}Qk9M9o8$!-AT6adpPf`gi3%P6R!a%z_KfWtd{OCnP<+?`QyOvPB%aKM zH|vT%ZQoR0zi_>!w70e0i1pWgBm6Sh6Bzeme4*zA{;myf3XG?v8^!VC$k49g#x=mATD!ObEBE-r}K z+!$h4TX}LV`aKirsmbG$sAeB3P??wXog?7hd?2OfT?iHSb91Coy-M)pF7mjWEs4Jn`dVg4ahvx<3Xm&a%zH!GQHmX+mc zgnU=B+m>Z2A$DTUl1GThet8A^KZ;u~zx{XNzX0QUCZyzW8=eHEGW2@#neeOKKXcG0 o-YX-r6B5#@Lq`7tQJX$n{>Au8Z?H~O9{;%c#1!{I7VE_OC literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test3.xls b/pandas/io/tests/test3.xls new file mode 100644 index 0000000000000000000000000000000000000000..f73943d6779517e5448aec76fe1b7cfd526f37ae GIT binary patch literal 23040 zcmeHPYiu0V6+ScGwa0NDcH%fs?6ni;;n*SBbwWZOo5VbY^C$!*70_g3uM;cAj+`}3 zB%&Bv{uD~Vra>icDW#|nDDM(bX%i#_t>m;7C8DGeg(^ir44tNpQ9^@{x^1gI3qcM) zNoymsDGDCca7q8-hB?i1<}6utZA=F$v4YuMCUAo!>&`g}5bD{`+}|2F+6KJ*{^&>!}pAN8R>;zNJPhyHCJ z`mH{6KY9G{pAa}TP>b+GTMysypuXs`+>9&0>e>NojW!i$%I*h`EoaJshs$2ANIeHAIlk({Af+~_#ej?6= zJhpx;|F4qtmuV+XXG)w2l5WfSU{-HgoRO-knu;3e`5r-s05-m*XG{O3=o7v4Tqesa za!!@>moW&WwYDCnNV-x!3c83h4m;jxRMU;DP&*~*rKfJGJ7l!bAs3ws=rjTGzXWGI z02O@<`i+U`t@Zp0))=*j3f5q-hzcx7Q6fc(t-vZPiUaqJMO9$q7FB_bQd9+Q7DZLy z_EJ;@Zm307;K)=|1#V(RRp2&UR0VF7ffd#(?SZ3ciGO>Dh=!8xVwaP{szZhf!Xc#q z5Oza<5RTggfUs-&gK*F&0EFGuAEdf45O!gI5DsevNWyOI55j%=m>{F=!PdbPq!5r` z>r5{UB-lDL3IhqY4mzwt9w*p3vkC(VwhrcQ4oPs~?n|t;6Hf$N2XnhZK!UAPTNp^N zbB`D70~@IanW$z(E5C?`z{#WV|=NDAeo z<&R?dr9$AiKpiJH7h9pF*b0k_t$^j9>@%>n*?=y^<_}AjpqSNras|vT+!dscz3_LZ zkEvsur8Wqgle3E}`wkO>!$`ak5HlGJk|+d3?QT@NwgM#Q2zif_@6O9mA=k!|$s4?t zXpVl%lnwRMZ)2)amfNcEZ1}Q4n}eDyRK9y$-~t&*`?!bs_J{*SZS5;;^{_7U=9ALL zOmox*k?MK;@yGL8yOSncJJWzIl5FixTE5LzqO8+qNpG8Z#l_@j7J@E>P&v=b(^P)U zTIH&sx~g`_xLWBBR-;11e=fs%SDUJ{`O5AfUy?PpBZJ_c6>)IU@D{KvlI}Y5<~wGT z&sj}i*T`!V%%)0Z6VGSU5WvO?M1oB;-F5EFAH3O2R@o%-*~|-IV}&ZgrZnC4_B*e6 zvzel@Y0YQT6u^dqSypLf>8`VX{Hr&cYL(5hd^U>%*l-BUvN6(Kr~md3Z#Gj^Hp}zb z!~@t^K}{&FJl*x-UtaZQQ^RapgFUSdY+4KOw9)pbN6&h*nWnPw@U%GrY@D7p+WzL# z$Gq80SJ`-Y+WY`EPEQ+c|NYr#z1hrA*?4%`q5w8dPaAC?J#pHb%}kYzho_w%z{crm zqwU9Dd&`^6ER~IirzHZ|I6ZB&{ex3adb63$Y?cRmS`2KK7vO2}#5bRK#bUz&4atsa z=c#NwJZ)|O8>gqm6Q_^d<;|v6W#i##3j)|UJuRMi{*|NNZ0b}t9-h`5z{crm@x)`_ zebJjuOl9NYX-fjwI6W<%7&>{zn@zpS#>3NE1K2n{EuMJ)_&2=RjK`~M%v>&PWu zpaw1AVcCXM=a8||ZFy~$Swb5yaqF~u;3$v2uqeQzBR=PqgKQ+J#Lfh);kTuFItF@D zneM*cA>$@Dpk8BEkhR&Ug9fn8CxxL`H}zo%#v^9#dl{4m#*zWADhLLpgf~RE$rDcv zg`HQ1>#E#!HLOpGfkGk{8|c-&rUOJ!Y2sjMq0m31|&M`+Pti8*A%+@eHFz@!Z-xgMVsSP^mA=D|{w zU7(aL#s0-y*%)1ep5QvWPRn2fb>=B`BBp^l1K4TRFUGzdbkMG`c(4pL7ArN%%<*|O z+`Oh@ZwTIGTzp_(PpUVS>FYlj+nPR*88R+)L+Q)RO1hX1V7ndLzE{}44}u(o&bLCG z1Mml)rbz3vv9s<~j%ULJ{nEZZIO-}lt~S$*(4|mXpW>>#wji;5&Kl| zDi3fg{`r8L$sZi&F+77OoZyIoD)?j%a4Qb_fLlWYU4XYb!4V6UfKW2U1DxZbWwA1+ zT88y;v;OYG$KrmER`x;JjlhtzU zSp>wUzD#<^*yYC1$DzG2Mg~F=o=-+zUMGn%L>z}fZi;0kZ*Hm$ma;@vN;oSl2}(#66;DzVqI~-yl^lI-OY825RJkRTky>N z_Qao6#;&@GU3F`BrU%+u>aJBX3Fg@f@5sP#tgn1;*BD~8fauGx2PkU~B{KvBE8Mbv 
zL^zdeeS!5ZHQfK;{b!A5C^KH4x(5v~jNW6^oHLRzfMV4}=Pn{em32=khX{1s99tKv8h zrMpCWGszoP<@o{|<*npG$N7D{r4@)8IB8HJYat=h?~?L6Ln9brjO4^X4R>12osURY zLkbi`AI6EF(K3#qW9%T#n^~zKP9mmR#>>$@6uK2#uTUte8=}NoYSLZ-?gr4R;Ye$- zO!pMO%cFIC$fjFisasuA>e$vhiAtmxx)5l$F^zURib2UdS1!3I$gY<3lGB&q$e$mZ z@&ig+%5RM7Kc-Ek4zQ1mtspMf<%qC=j}@#kmh}l1_{3H?i}6%wh0oFo4zvDGpMHeS zu2ghIZZX-}UccqubMI~5Q~BVX1}&QZ#H-vZJ&Me4@%jEbWR9wrA#+sNi98wk05bO= zcOYYp4zpmQDab!W=74e*m3U7GM+WDQif}n5ePee=f8YMTu1xIW10CreD~dI(cxLnY zC$)8`KX~2l#xLL;W9=}78VEHIY9Q1=sDV%ep$0+?gc=An5NaUQK&XLG16d6O&HwdN zCr^zmo>e(~H|GD%?>)@(|L2i;uFvy-p7-<5ceWt&&(pRe^PGMsGSBz-AoHs6K4e}8 zkm~~X;asi*So8FWqCUvJ-?aedvb^?SEtKa@KD$Vcoc^=O@l=!b)c&=ZC%`!}4ER@IO-U zA4>4lybd{rT#wB6=iqxTaw9VT55Ron1;`7L`TmE<{2iuGR|8jKlOpfCjA7$B|I(8; zn-`O6C@S`Vu~h6wF@ujpC&ZC1vA*0#wxgtM1NVOXs9=|nW3MC^{EdHrwfVCQg;w02 z-FFwItHhQ~{xtzVJMX#f5;8F*_wx%s+y&szZHU`Xq5SzjVnpM?SClRSH~zWD_1Hkl Q7AQ`h6(8gI8yx)q1OB=L6aWAK literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py new file mode 100644 index 00000000..f0549358 --- /dev/null +++ b/pandas/io/tests/test_date_converters.py @@ -0,0 +1,99 @@ +from pandas.util.py3compat import StringIO, BytesIO +from datetime import date, datetime +import csv +import os +import sys +import re +import unittest + +import nose + +from numpy import nan +import numpy as np +from numpy.testing.decorators import slow + +from pandas import DataFrame, Series, Index, isnull +import pandas.io.parsers as parsers +from pandas.io.parsers import (read_csv, read_table, read_fwf, + ExcelFile, TextParser) +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal, network) +import pandas.lib as lib +from pandas.util import py3compat +from pandas.lib import Timestamp +import pandas.io.date_converters as conv + +class TestConverters(unittest.TestCase): + + def setUp(self): + self.years = np.array([2007, 2008]) + self.months = np.array([1, 2]) + self.days = np.array([3, 4]) + self.hours = np.array([5, 6]) + self.minutes = np.array([7, 8]) + self.seconds = np.array([9, 0]) + self.dates = np.array(['2007/1/3', '2008/2/4'], dtype=object) + self.times = np.array(['05:07:09', '06:08:00'], dtype=object) + self.expected = np.array([datetime(2007, 1, 3, 5, 7, 9), + datetime(2008, 2, 4, 6, 8, 0)]) + + def test_parse_date_time(self): + result = conv.parse_date_time(self.dates, self.times) + self.assert_((result == self.expected).all()) + + data = """\ +date, time, a, b +2001-01-05, 10:00:00, 0.0, 10. +2001-01-05, 00:00:00, 1., 11. +""" + datecols = {'date_time': [0, 1]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, date_parser=conv.parse_date_time) + self.assert_('date_time' in df) + self.assert_(df.date_time.ix[0] == datetime(2001, 1, 5, 10, 0, 0)) + + def test_parse_date_fields(self): + result = conv.parse_date_fields(self.years, self.months, self.days) + expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + self.assert_((result == expected).all()) + + data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." 
+ datecols = {'ymd': [0, 1, 2]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_date_fields) + self.assert_('ymd' in df) + self.assert_(df.ymd.ix[0] == datetime(2001, 1, 10)) + + def test_datetime_six_col(self): + result = conv.parse_all_fields(self.years, self.months, self.days, + self.hours, self.minutes, self.seconds) + self.assert_((result == self.expected).all()) + + data = """\ +year, month, day, hour, minute, second, a, b +2001, 01, 05, 10, 00, 0, 0.0, 10. +2001, 01, 5, 10, 0, 00, 1., 11. +""" + datecols = {'ymdHMS': [0, 1, 2, 3, 4, 5]} + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=conv.parse_all_fields) + self.assert_('ymdHMS' in df) + self.assert_(df.ymdHMS.ix[0] == datetime(2001, 1, 5, 10, 0, 0)) + + def test_generic(self): + data = "year, month, day, a\n 2001, 01, 10, 10.\n 2001, 02, 1, 11." + datecols = {'ym': [0, 1]} + dateconverter = lambda y, m: date(year=int(y), month=int(m), day=1) + df = read_table(StringIO(data), sep=',', header=0, + parse_dates=datecols, + date_parser=dateconverter) + self.assert_('ym' in df) + self.assert_(df.ym.ix[0] == date(2001, 1, 1)) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py new file mode 100644 index 00000000..ba7d3c90 --- /dev/null +++ b/pandas/io/tests/test_parsers.py @@ -0,0 +1,1396 @@ +from pandas.util.py3compat import StringIO, BytesIO +from datetime import datetime +import csv +import os +import sys +import re +import unittest + +import nose + +from numpy import nan +import numpy as np + +from pandas import DataFrame, Series, Index, isnull +import pandas.io.parsers as parsers +from pandas.io.parsers import (read_csv, read_table, read_fwf, + ExcelFile, TextParser) +from pandas.util.testing import (assert_almost_equal, assert_frame_equal, + assert_series_equal, network) +import pandas.lib as lib +from pandas.util import py3compat +from pandas.lib import Timestamp +from pandas.tseries.index import date_range + +from numpy.testing.decorators import slow +from pandas.io.date_converters import ( + parse_date_time, parse_date_fields, parse_all_fields +) + +def _skip_if_no_xlrd(): + try: + import xlrd + except ImportError: + raise nose.SkipTest('xlrd not installed, skipping') + +def _skip_if_no_openpyxl(): + try: + import openpyxl + except ImportError: + raise nose.SkipTest('openpyxl not installed, skipping') + + +class TestParsers(unittest.TestCase): + data1 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo2,12,13,14,15 +bar2,12,13,14,15 +""" + ts_data = """\ +ID,date,nominalTime,actualTime,A,B,C,D,E +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + + def setUp(self): + self.dirpath = curpath() + self.csv1 = os.path.join(self.dirpath, 'test1.csv') + self.csv2 = os.path.join(self.dirpath, 'test2.csv') + self.xls1 = os.path.join(self.dirpath, 'test.xls') + + def test_empty_string(self): + data = """\ +One,Two,Three +a,1,one +b,2,two 
+,3,three +d,4,nan +e,5,five +nan,6, +g,7,seven +""" + df = read_csv(StringIO(data)) + xp = DataFrame({'One' : ['a', 'b', np.nan, 'd', 'e', np.nan, 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', np.nan, 'five', + np.nan, 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + df = read_csv(StringIO(data), na_values={'One': [], 'Three': []}) + xp = DataFrame({'One' : ['a', 'b', '', 'd', 'e', 'nan', 'g'], + 'Two' : [1,2,3,4,5,6,7], + 'Three' : ['one', 'two', 'three', 'nan', 'five', + '', 'seven']}) + assert_frame_equal(xp.reindex(columns=df.columns), df) + + + def test_read_csv(self): + pass + + def test_dialect(self): + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + df = read_csv(StringIO(data), dialect=dia) + + data = '''\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +''' + exp = read_csv(StringIO(data)) + exp.replace('a', '"a', inplace=True) + assert_frame_equal(df, exp) + + def test_1000_sep(self): + data = """A|B|C +1|2,334.0|5 +10|13|10. +""" + expected = [[1, 2334., 5], + [10, 13, 10]] + + df = read_csv(StringIO(data), sep='|', thousands=',') + assert_almost_equal(df.values, expected) + + df = read_table(StringIO(data), sep='|', thousands=',') + assert_almost_equal(df.values, expected) + + def test_1000_fwf(self): + data = """ + 1 2,334.0 5 +10 13 10. +""" + expected = [[1, 2334., 5], + [10, 13, 10]] + df = read_fwf(StringIO(data), colspecs=[(0,3),(3,11),(12,16)], + thousands=',') + assert_almost_equal(df.values, expected) + + def test_comment(self): + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = read_csv(StringIO(data), comment='#') + assert_almost_equal(df.values, expected) + + df = read_table(StringIO(data), sep=',', comment='#', na_values=['NaN']) + assert_almost_equal(df.values, expected) + + def test_comment_fwf(self): + data = """ + 1 2. 
4 #hello world + 5 NaN 10.0 +""" + expected = [[1, 2., 4], + [5, np.nan, 10.]] + df = read_fwf(StringIO(data), colspecs=[(0,3),(4,9),(9,25)], + comment='#') + assert_almost_equal(df.values, expected) + + def test_squeeze(self): + data = """\ +a,1 +b,2 +c,3 +""" + expected = Series([1,2,3], ['a', 'b', 'c']) + result = read_table(StringIO(data), sep=',', index_col=0, + header=None, squeeze=True) + self.assert_(isinstance(result, Series)) + assert_series_equal(result, expected) + + def test_multiple_date_col(self): + # Can use multiple date parsers + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + def func(*date_cols): + return lib.try_parse_dates(parsers._concat_date_cols(date_cols)) + + df = read_csv(StringIO(data), header=None, + date_parser=func, + parse_dates={'nominal' : [1, 2], + 'actual' : [1,3]}) + self.assert_('nominal' in df) + self.assert_('actual' in df) + self.assert_('X.2' not in df) + self.assert_('X.3' not in df) + self.assert_('X.4' not in df) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.ix[0, 'nominal'] == d) + + df = read_csv(StringIO(data), header=None, + date_parser=func, + parse_dates={'nominal' : [1, 2], + 'actual' : [1,3]}, + keep_date_col=True) + self.assert_('nominal' in df) + self.assert_('actual' in df) + self.assert_('X.2' in df) + self.assert_('X.3' in df) + self.assert_('X.4' in df) + + data = """\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 +""" + df = read_csv(StringIO(data), header=None, + parse_dates=[[1, 2], [1,3]]) + self.assert_('X.2_X.3' in df) + self.assert_('X.2_X.4' in df) + self.assert_('X.2' not in df) + self.assert_('X.3' not in df) + self.assert_('X.4' not in df) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.ix[0, 'X.2_X.3'] == d) + + df = read_csv(StringIO(data), header=None, + parse_dates=[[1, 2], [1,3]], keep_date_col=True) + self.assert_('X.2_X.3' in df) + self.assert_('X.2_X.4' in df) + self.assert_('X.2' in df) + self.assert_('X.3' in df) + self.assert_('X.4' in df) + + data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' + df = read_csv(StringIO(data), sep=',', header=None, + parse_dates=[1], index_col=1) + from datetime import datetime + d = datetime(1999, 1, 27, 19, 0) + self.assert_(df.index[0] == d) + + def test_multiple_date_cols_with_header(self): + data = """\ 
+ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + self.assert_(not isinstance(df.nominal[0], basestring)) + + def test_multiple_date_cols_index(self): + data = """\ +ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + xp = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}) + df = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col='nominal') + assert_frame_equal(xp.set_index('nominal'), df) + df2 = read_csv(StringIO(data), parse_dates={'nominal': [1, 2]}, + index_col=0) + assert_frame_equal(df2, df) + + df3 = read_csv(StringIO(data), parse_dates=[[1, 2]], index_col=0) + assert_frame_equal(df3, df) + + def test_multiple_date_cols_chunked(self): + df = read_csv(StringIO(self.ts_data), parse_dates={'nominal': [1,2]}, + index_col='nominal') + reader = read_csv(StringIO(self.ts_data), parse_dates={'nominal': [1,2]}, + index_col='nominal', chunksize=2) + + chunks = list(reader) + + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + def test_multiple_date_col_multiple_index(self): + df = read_csv(StringIO(self.ts_data), parse_dates={'nominal' : [1, 2]}, + index_col=['nominal', 'ID']) + xp = read_csv(StringIO(self.ts_data), parse_dates={'nominal' : [1, 2]}) + assert_frame_equal(xp.set_index(['nominal', 'ID']), df) + + def test_multiple_date_col_name_collision(self): + self.assertRaises(ValueError, read_csv, StringIO(self.ts_data), + parse_dates={'ID' : [1, 2]}) + + data = """\ +date_NominalTime,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir +KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + self.assertRaises(ValueError, read_csv, StringIO(data), + parse_dates=[[1, 2]]) + + def test_multiple_date_col_named_components(self): + xp = read_csv(StringIO(self.ts_data), parse_dates={'nominal': [1,2]}, + index_col='nominal') + colspec = {'nominal' : ['date', 'nominalTime']} + df = read_csv(StringIO(self.ts_data), parse_dates=colspec, + index_col='nominal') + assert_frame_equal(df, xp) + + def test_index_col_named(self): + no_header = """\ 
+KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" + + h = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" + data = h + no_header + #import pdb; pdb.set_trace() + rs = read_csv(StringIO(data), index_col='ID') + xp = read_csv(StringIO(data), header=0).set_index('ID') + assert_frame_equal(rs, xp) + + self.assertRaises(ValueError, read_csv, StringIO(no_header), + index_col='ID') + + data = """\ +1,2,3,4,hello +5,6,7,8,world +9,10,11,12,foo +""" + names = ['a', 'b', 'c', 'd', 'message'] + xp = DataFrame({'a' : [1, 5, 9], 'b' : [2, 6, 10], 'c' : [3, 7, 11], + 'd' : [4, 8, 12]}, + index=Index(['hello', 'world', 'foo'], name='message')) + rs = read_csv(StringIO(data), names=names, index_col=['message']) + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + + rs = read_csv(StringIO(data), names=names, index_col='message') + assert_frame_equal(xp, rs) + self.assert_(xp.index.name == rs.index.name) + + def test_multiple_skts_example(self): + data = "year, month, a, b\n 2001, 01, 0.0, 10.\n 2001, 02, 1.1, 11." + pass + + def test_malformed(self): + # all + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +""" + + try: + df = read_table(StringIO(data), sep=',', header=1, comment='#') + self.assert_(False) + except ValueError, inst: + self.assert_('Expecting 3 columns, got 5 in row 3' in str(inst)) + + #skip_footer + data = """ignore +A,B,C +1,2,3 # comment +1,2,3,4,5 +2,3,4 +footer +""" + + try: + df = read_table(StringIO(data), sep=',', header=1, comment='#', + skip_footer=1) + self.assert_(False) + except ValueError, inst: + self.assert_('Expecting 3 columns, got 5 in row 3' in str(inst)) + + # first chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.get_chunk(5) + self.assert_(False) + except ValueError, inst: + self.assert_('Expecting 3 columns, got 5 in row 5' in str(inst)) + + + # middle chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.get_chunk(1) + it.get_chunk(2) + self.assert_(False) + except ValueError, inst: + self.assert_('Expecting 3 columns, got 5 in row 5' in str(inst)) + + + # last chunk + data = """ignore +A,B,C +skip +1,2,3 +3,5,10 # comment +1,2,3,4,5 +2,3,4 +""" + try: + it = read_table(StringIO(data), sep=',', + header=1, comment='#', iterator=True, chunksize=1, + skiprows=[2]) + df = it.get_chunk(1) + it.get_chunk() + self.assert_(False) + except ValueError, inst: + self.assert_('Expecting 3 columns, got 5 in row 5' in str(inst)) + + def test_quoting(self): + bad_line_small = """printer\tresult\tvariant_name +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jacob +Klosterdruckerei\tKlosterdruckerei (1611-1804)\tMuller, Jakob +Klosterdruckerei\tKlosterdruckerei (1609-1805)\t"Furststiftische Hofdruckerei, (1609-1805)\tGaller, Alois +Klosterdruckerei\tKlosterdruckerei 
(1609-1805)\tHochfurstliche Buchhandlung """ + self.assertRaises(Exception, read_table, StringIO(bad_line_small), + sep='\t') + + good_line_small = bad_line_small + '"' + df = read_table(StringIO(good_line_small), sep='\t') + self.assert_(len(df) == 3) + + def test_custom_na_values(self): + data = """A,B,C +ignore,this,row +1,NA,3 +-1.#IND,5,baz +7,8,NaN +""" + expected = [[1., nan, 3], + [nan, 5, nan], + [7, 8, nan]] + + df = read_csv(StringIO(data), na_values=['baz'], skiprows=[1]) + assert_almost_equal(df.values, expected) + + df2 = read_table(StringIO(data), sep=',', na_values=['baz'], + skiprows=[1]) + assert_almost_equal(df2.values, expected) + + + def test_skiprows_bug(self): + # GH #505 + text = """#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +#foo,a,b,c +1/1/2000,1.,2.,3. +1/2/2000,4,5,6 +1/3/2000,7,8,9 +""" + data = read_csv(StringIO(text), skiprows=range(6), header=None, + index_col=0, parse_dates=True) + + data2 = read_csv(StringIO(text), skiprows=6, header=None, + index_col=0, parse_dates=True) + + expected = DataFrame(np.arange(1., 10.).reshape((3,3)), + columns=['X.2', 'X.3', 'X.4'], + index=[datetime(2000, 1, 1), datetime(2000, 1, 2), + datetime(2000, 1, 3)]) + assert_frame_equal(data, expected) + assert_frame_equal(data, data2) + + + def test_detect_string_na(self): + data = """A,B +foo,bar +NA,baz +NaN,nan +""" + expected = [['foo', 'bar'], + [nan, 'baz'], + [nan, nan]] + + df = read_csv(StringIO(data)) + assert_almost_equal(df.values, expected) + + def test_unnamed_columns(self): + data = """A,B,C,, +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + expected = [[1,2,3,4,5.], + [6,7,8,9,10], + [11,12,13,14,15]] + df = read_table(StringIO(data), sep=',') + assert_almost_equal(df.values, expected) + self.assert_(np.array_equal(df.columns, + ['A', 'B', 'C', 'Unnamed: 3', + 'Unnamed: 4'])) + + def test_string_nas(self): + data = """A,B,C +a,b,c +d,,f +,g,h +""" + result = read_csv(StringIO(data)) + expected = DataFrame([['a', 'b', 'c'], + ['d', np.nan, 'f'], + [np.nan, 'g', 'h']], + columns=['A', 'B', 'C']) + + assert_frame_equal(result, expected) + + def test_duplicate_columns(self): + data = """A,A,B,B,B +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df = read_table(StringIO(data), sep=',') + self.assert_(np.array_equal(df.columns, + ['A', 'A.1', 'B', 'B.1', 'B.2'])) + + def test_csv_mixed_type(self): + data = """A,B,C +a,1,2 +b,3,4 +c,4,5 +""" + df = read_csv(StringIO(data)) + # TODO + + def test_csv_custom_parser(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = read_csv(StringIO(data), + date_parser=lambda x: datetime.strptime(x, '%Y%m%d')) + expected = read_csv(StringIO(data), parse_dates=True) + assert_frame_equal(df, expected) + + def test_parse_dates_implicit_first_col(self): + data = """A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + df = read_csv(StringIO(data), parse_dates=True) + expected = read_csv(StringIO(data), index_col=0, parse_dates=True) + self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) + assert_frame_equal(df, expected) + + def test_parse_dates_string(self): + data = """date,A,B,C +20090101,a,1,2 +20090102,b,3,4 +20090103,c,4,5 +""" + rs = read_csv(StringIO(data), index_col='date', parse_dates='date') + idx = date_range('1/1/2009', periods=3).asobject + idx.name = 'date' + xp = DataFrame({'A': ['a', 'b', 'c'], + 'B': [1, 3, 4], + 'C': [2, 4, 5]}, idx) + assert_frame_equal(rs, xp) + + + def test_parse_dates_column_list(self): + from pandas.core.datetools import to_datetime 
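+        # parse_dates below is given a list of column names / positions rather
+        # than a dict, and dayfirst=True is required because the dates in this
+        # block are written in DD/MM/YYYY form.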
+ + data = '''date;destination;ventilationcode;unitcode;units;aux_date +01/01/2010;P;P;50;1;12/1/2011 +01/01/2010;P;R;50;1;13/1/2011 +15/01/2010;P;P;50;1;14/1/2011 +01/05/2010;P;P;50;1;15/1/2011''' + + expected = read_csv(StringIO(data), sep=";", index_col=range(4)) + + lev = expected.index.levels[0] + expected.index.levels[0] = lev.to_datetime(dayfirst=True) + expected['aux_date'] = to_datetime(expected['aux_date'], + dayfirst=True) + expected['aux_date'] = map(Timestamp, expected['aux_date']) + self.assert_(isinstance(expected['aux_date'][0], datetime)) + + df = read_csv(StringIO(data), sep=";", index_col = range(4), + parse_dates=[0, 5], dayfirst=True) + assert_frame_equal(df, expected) + + df = read_csv(StringIO(data), sep=";", index_col = range(4), + parse_dates=['date', 'aux_date'], dayfirst=True) + assert_frame_equal(df, expected) + + def test_no_header(self): + data = """1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + df = read_table(StringIO(data), sep=',', header=None) + names = ['foo', 'bar', 'baz', 'quux', 'panda'] + df2 = read_table(StringIO(data), sep=',', header=None, names=names) + expected = [[1,2,3,4,5.], + [6,7,8,9,10], + [11,12,13,14,15]] + assert_almost_equal(df.values, expected) + assert_almost_equal(df.values, df2.values) + self.assert_(np.array_equal(df.columns, + ['X.1', 'X.2', 'X.3', 'X.4', 'X.5'])) + self.assert_(np.array_equal(df2.columns, names)) + + def test_header_with_index_col(self): + data = """foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + names = ['A', 'B', 'C'] + df = read_csv(StringIO(data), names=names) + + self.assertEqual(names, ['A', 'B', 'C']) + + values = [[1,2,3],[4,5,6],[7,8,9]] + expected = DataFrame(values, index=['foo','bar','baz'], + columns=['A','B','C']) + assert_frame_equal(df, expected) + + def test_read_csv_dataframe(self): + df = read_csv(self.csv1, index_col=0, parse_dates=True) + df2 = read_table(self.csv1, sep=',', index_col=0, parse_dates=True) + self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D'])) + self.assert_(df.index.name == 'index') + self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) + self.assert_(df.values.dtype == np.float64) + assert_frame_equal(df, df2) + + def test_read_csv_no_index_name(self): + df = read_csv(self.csv2, index_col=0, parse_dates=True) + df2 = read_table(self.csv2, sep=',', index_col=0, parse_dates=True) + self.assert_(np.array_equal(df.columns, ['A', 'B', 'C', 'D', 'E'])) + self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp))) + self.assert_(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype == np.float64) + assert_frame_equal(df, df2) + + def test_excel_stop_iterator(self): + _skip_if_no_xlrd() + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xls')) + parsed = excel_data.parse('Sheet1') + expected = DataFrame([['aaaa','bbbbb']], columns=['Test', 'Test1']) + assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self): + _skip_if_no_xlrd() + + excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls')) + parsed = excel_data.parse('Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + assert_frame_equal(parsed, expected) + + def test_excel_table(self): + _skip_if_no_xlrd() + + pth = os.path.join(self.dirpath, 'test.xls') + xls = ExcelFile(pth) + df = xls.parse('Sheet1', index_col=0, parse_dates=True) + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) + assert_frame_equal(df, df2) + assert_frame_equal(df3, df2) + + def 
test_excel_read_buffer(self): + _skip_if_no_xlrd() + _skip_if_no_openpyxl() + + pth = os.path.join(self.dirpath, 'test.xls') + f = open(pth, 'rb') + xls = ExcelFile(f) + # it works + xls.parse('Sheet1', index_col=0, parse_dates=True) + + pth = os.path.join(self.dirpath, 'test.xlsx') + f = open(pth, 'rb') + xl = ExcelFile(f) + df = xl.parse('Sheet1', index_col=0, parse_dates=True) + + def test_xlsx_table(self): + _skip_if_no_openpyxl() + + pth = os.path.join(self.dirpath, 'test.xlsx') + xlsx = ExcelFile(pth) + df = xlsx.parse('Sheet1', index_col=0, parse_dates=True) + df2 = read_csv(self.csv1, index_col=0, parse_dates=True) + df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) + assert_frame_equal(df, df2) + assert_frame_equal(df3, df2) + + def test_read_table_wrong_num_columns(self): + data = """A,B,C,D,E,F +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 +""" + self.assertRaises(Exception, read_csv, StringIO(data)) + + def test_read_table_duplicate_index(self): + data = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + result = read_csv(StringIO(data), index_col=0) + expected = read_csv(StringIO(data)).set_index('index', + verify_integrity=False) + assert_frame_equal(result, expected) + + def test_read_table_duplicate_index_implicit(self): + data = """A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +qux,12,13,14,15 +foo,12,13,14,15 +bar,12,13,14,15 +""" + + # it works! + result = read_csv(StringIO(data)) + + def test_parse_bools(self): + data = """A,B +True,1 +False,2 +True,3 +""" + data = read_csv(StringIO(data)) + self.assert_(data['A'].dtype == np.bool_) + + def test_int_conversion(self): + data = """A,B +1.0,1 +2.0,2 +3.0,3 +""" + data = read_csv(StringIO(data)) + self.assert_(data['A'].dtype == np.float64) + self.assert_(data['B'].dtype == np.int64) + + def test_infer_index_col(self): + data = """A,B,C +foo,1,2,3 +bar,4,5,6 +baz,7,8,9 +""" + data = read_csv(StringIO(data)) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + + def test_sniff_delimiter(self): + text = """index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data = read_csv(StringIO(text), index_col=0, sep=None) + self.assert_(data.index.equals(Index(['foo', 'bar', 'baz']))) + + data2 = read_csv(StringIO(text), index_col=0, delimiter='|') + assert_frame_equal(data, data2) + + text = """ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""" + data3 = read_csv(StringIO(text), index_col=0, sep=None, skiprows=2) + assert_frame_equal(data, data3) + + # can't get this to work on Python 3 + if not py3compat.PY3: + text = u"""ignore this +ignore this too +index|A|B|C +foo|1|2|3 +bar|4|5|6 +baz|7|8|9 +""".encode('utf-8') + data4 = read_csv(BytesIO(text), index_col=0, sep=None, skiprows=2, + encoding='utf-8') + assert_frame_equal(data, data4) + + def test_read_nrows(self): + df = read_csv(StringIO(self.data1), nrows=3) + expected = read_csv(StringIO(self.data1))[:3] + assert_frame_equal(df, expected) + + def test_read_chunksize(self): + reader = read_csv(StringIO(self.data1), index_col=0, chunksize=2) + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(reader) + + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + def test_read_chunksize_named(self): + reader = read_csv(StringIO(self.data1), index_col='index', chunksize=2) + df = read_csv(StringIO(self.data1), index_col='index') + + chunks = list(reader) + + 
assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + def test_read_text_list(self): + data = """A,B,C\nfoo,1,2,3\nbar,4,5,6""" + as_list = [['A','B','C'],['foo','1','2','3'],['bar','4','5','6']] + df = read_csv(StringIO(data), index_col=0) + + parser = TextParser(as_list, index_col=0, chunksize=2) + chunk = parser.get_chunk(None) + + assert_frame_equal(chunk, df) + + def test_iterator(self): + reader = read_csv(StringIO(self.data1), index_col=0, iterator=True) + df = read_csv(StringIO(self.data1), index_col=0) + + chunk = reader.get_chunk(3) + assert_frame_equal(chunk, df[:3]) + + last_chunk = reader.get_chunk(5) + assert_frame_equal(last_chunk, df[3:]) + + # pass list + lines = list(csv.reader(StringIO(self.data1))) + parser = TextParser(lines, index_col=0, chunksize=2) + + df = read_csv(StringIO(self.data1), index_col=0) + + chunks = list(parser) + assert_frame_equal(chunks[0], df[:2]) + assert_frame_equal(chunks[1], df[2:4]) + assert_frame_equal(chunks[2], df[4:]) + + # pass skiprows + parser = TextParser(lines, index_col=0, chunksize=2, skiprows=[1]) + chunks = list(parser) + assert_frame_equal(chunks[0], df[1:3]) + + # test bad parameter (skip_footer) + reader = read_csv(StringIO(self.data1), index_col=0, iterator=True, + skip_footer=True) + self.assertRaises(ValueError, reader.get_chunk, 3) + + treader = read_table(StringIO(self.data1), sep=',', index_col=0, + iterator=True) + self.assert_(isinstance(treader, TextParser)) + + def test_header_not_first_line(self): + data = """got,to,ignore,this,line +got,to,ignore,this,line +index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + data2 = """index,A,B,C,D +foo,2,3,4,5 +bar,7,8,9,10 +baz,12,13,14,15 +""" + + df = read_csv(StringIO(data), header=2, index_col=0) + expected = read_csv(StringIO(data2), header=0, index_col=0) + assert_frame_equal(df, expected) + + def test_pass_names_with_index(self): + lines = self.data1.split('\n') + no_header = '\n'.join(lines[1:]) + + # regular index + names = ['index', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=0, names=names) + expected = read_csv(StringIO(self.data1), index_col=0) + assert_frame_equal(df, expected) + + # multi index + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['index1', 'index2', 'A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) + + df = read_csv(StringIO(data), index_col=['index1', 'index2']) + assert_frame_equal(df, expected) + + def test_multi_index_no_level_names(self): + data = """index1,index2,A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + data2 = """A,B,C,D +foo,one,2,3,4,5 +foo,two,7,8,9,10 +foo,three,12,13,14,15 +bar,one,12,13,14,15 +bar,two,12,13,14,15 +""" + + lines = data.split('\n') + no_header = '\n'.join(lines[1:]) + names = ['A', 'B', 'C', 'D'] + df = read_csv(StringIO(no_header), index_col=[0, 1], names=names) + expected = read_csv(StringIO(data), index_col=[0, 1]) + assert_frame_equal(df, expected) + + # 2 implicit first cols + df2 = read_csv(StringIO(data2)) + assert_frame_equal(df2, df) + + def test_multi_index_parse_dates(self): + data = """index1,index2,A,B,C +20090101,one,a,1,2 +20090101,two,b,3,4 
+20090101,three,c,4,5 +20090102,one,a,1,2 +20090102,two,b,3,4 +20090102,three,c,4,5 +20090103,one,a,1,2 +20090103,two,b,3,4 +20090103,three,c,4,5 +""" + df = read_csv(StringIO(data), index_col=[0, 1], parse_dates=True) + self.assert_(isinstance(df.index.levels[0][0], + (datetime, np.datetime64, Timestamp))) + + # specify columns out of order! + df2 = read_csv(StringIO(data), index_col=[1, 0], parse_dates=True) + self.assert_(isinstance(df2.index.levels[1][0], + (datetime, np.datetime64, Timestamp))) + + def test_skip_footer(self): + data = """A,B,C +1,2,3 +4,5,6 +7,8,9 +want to skip this +also also skip this +and this +""" + result = read_csv(StringIO(data), skip_footer=3) + no_footer = '\n'.join(data.split('\n')[:-4]) + expected = read_csv(StringIO(no_footer)) + + assert_frame_equal(result, expected) + + def test_no_unnamed_index(self): + data = """ id c0 c1 c2 +0 1 0 a b +1 2 0 c d +2 2 2 e f +""" + df = read_table(StringIO(data), sep=' ') + self.assert_(df.index.name is None) + + def test_converters(self): + data = """A,B,C,D +a,1,2,01/01/2009 +b,3,4,01/02/2009 +c,4,5,01/03/2009 +""" + from dateutil import parser + + result = read_csv(StringIO(data), converters={'D' : parser.parse}) + result2 = read_csv(StringIO(data), converters={3 : parser.parse}) + + expected = read_csv(StringIO(data)) + expected['D'] = expected['D'].map(parser.parse) + + self.assert_(isinstance(result['D'][0], (datetime, Timestamp))) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + # produce integer + converter = lambda x: int(x.split('/')[2]) + result = read_csv(StringIO(data), converters={'D' : converter}) + expected = read_csv(StringIO(data)) + expected['D'] = expected['D'].map(converter) + assert_frame_equal(result, expected) + + def test_converters_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + f = lambda x : float(x.replace(",", ".")) + converter = {'Number1':f,'Number2':f, 'Number3':f} + df2 = read_csv(StringIO(data), sep=';',converters=converter) + self.assert_(df2['Number1'].dtype == float) + self.assert_(df2['Number2'].dtype == float) + self.assert_(df2['Number3'].dtype == float) + + def test_converter_return_string_bug(self): + # GH #583 + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + f = lambda x : x.replace(",", ".") + converter = {'Number1':f,'Number2':f, 'Number3':f} + df2 = read_csv(StringIO(data), sep=';',converters=converter) + self.assert_(df2['Number1'].dtype == float) + + def test_regex_separator(self): + data = """ A B C D +a 1 2 3 4 +b 1 2 3 4 +c 1 2 3 4 +""" + df = read_table(StringIO(data), sep='\s+') + expected = read_csv(StringIO(re.sub('[ ]+', ',', data)), + index_col=0) + self.assert_(expected.index.name is None) + assert_frame_equal(df, expected) + + def test_verbose_import(self): + text = """a,b,c,d +one,1,2,3 +one,1,2,3 +,1,2,3 +one,1,2,3 +,1,2,3 +,1,2,3 +one,1,2,3 +two,1,2,3""" + + buf = StringIO() + sys.stdout = buf + + try: + # it works! 
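+            # verbose=True makes the parser report the number of NA values
+            # filled per column; the captured stdout is checked below.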
+ df = read_csv(StringIO(text), verbose=True) + self.assert_(buf.getvalue() == 'Filled 3 NA values in column a\n') + finally: + sys.stdout = sys.__stdout__ + + buf = StringIO() + sys.stdout = buf + + text = """a,b,c,d +one,1,2,3 +two,1,2,3 +three,1,2,3 +four,1,2,3 +five,1,2,3 +,1,2,3 +seven,1,2,3 +eight,1,2,3""" + + try: + # it works! + df = read_csv(StringIO(text), verbose=True, index_col=0) + self.assert_(buf.getvalue() == 'Found 1 NA values in the index\n') + finally: + sys.stdout = sys.__stdout__ + + def test_read_table_buglet_4x_multiindex(self): + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + # it works! + df = read_table(StringIO(text), sep='\s+') + self.assertEquals(df.index.names, ['one', 'two', 'three', 'four']) + + def test_read_csv_parse_simple_list(self): + text = """foo +bar baz +qux foo +foo +bar""" + df = read_csv(StringIO(text), header=None) + expected = DataFrame({'X.1' : ['foo', 'bar baz', 'qux foo', + 'foo', 'bar']}) + assert_frame_equal(df, expected) + + def test_parse_dates_custom_euroformat(self): + from dateutil.parser import parse + text = """foo,bar,baz +31/01/2010,1,2 +01/02/2010,1,NA +02/02/2010,1,2 +""" + parser = lambda d: parse(d, dayfirst=True) + df = read_csv(StringIO(text), skiprows=[0], + names=['time', 'Q', 'NTU'], index_col=0, + parse_dates=True, date_parser=parser, + na_values=['NA']) + + exp_index = Index([datetime(2010, 1, 31), datetime(2010, 2, 1), + datetime(2010, 2, 2)], name='time') + expected = DataFrame({'Q' : [1, 1, 1], 'NTU' : [2, np.nan, 2]}, + index=exp_index, columns=['Q', 'NTU']) + assert_frame_equal(df, expected) + + parser = lambda d: parse(d, day_first=True) + self.assertRaises(Exception, read_csv, + StringIO(text), skiprows=[0], + names=['time', 'Q', 'NTU'], index_col=0, + parse_dates=True, date_parser=parser, + na_values=['NA']) + + def test_converters_corner_with_nas(self): + import StringIO + import numpy as np + import pandas + csv = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + def convert_days(x): + x = x.strip() + if not x: return np.nan + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_days_sentinel(x): + x = x.strip() + if not x: return -1 + + is_plus = x.endswith('+') + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + return x + + def convert_score(x): + x = x.strip() + if not x: return np.nan + if x.find('-')>0: + valmin, valmax = map(int, x.split('-')) + val = 0.5*(valmin + valmax) + else: + val = float(x) + + return val + + fh = StringIO.StringIO(csv) + result = pandas.read_csv(fh, converters={'score':convert_score, + 'days':convert_days}, + na_values=[-1,'',None]) + self.assert_(isnull(result['days'][1])) + + fh = StringIO.StringIO(csv) + result2 = pandas.read_csv(fh, converters={'score':convert_score, + 'days':convert_days_sentinel}, + na_values=[-1,'',None]) + assert_frame_equal(result, result2) + + def test_fwf(self): + data_expected = """\ +2011,58,360.242940,149.910199,11950.7 +2011,59,444.953632,166.985655,11788.4 +2011,60,364.136849,183.628767,11806.2 +2011,61,413.836124,184.375703,11916.8 +2011,62,502.953953,173.237159,12468.3 +""" + expected = read_csv(StringIO(data_expected), header=None) + + data1 = """\ +201158 360.242940 149.910199 11950.7 +201159 444.953632 166.985655 11788.4 +201160 364.136849 183.628767 11806.2 +201161 413.836124 184.375703 11916.8 +201162 502.953953 
173.237159 12468.3 +""" + colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)] + df = read_fwf(StringIO(data1), colspecs=colspecs, header=None) + assert_frame_equal(df, expected) + + data2 = """\ +2011 58 360.242940 149.910199 11950.7 +2011 59 444.953632 166.985655 11788.4 +2011 60 364.136849 183.628767 11806.2 +2011 61 413.836124 184.375703 11916.8 +2011 62 502.953953 173.237159 12468.3 +""" + df = read_fwf(StringIO(data2), widths=[5, 5, 13, 13, 7], header=None) + assert_frame_equal(df, expected) + + # From Thomas Kluyver: apparently some non-space filler characters can + # be seen, this is supported by specifying the 'delimiter' character: + # http://publib.boulder.ibm.com/infocenter/dmndhelp/v6r1mx/index.jsp?topic=/com.ibm.wbit.612.help.config.doc/topics/rfixwidth.html + data3 = """\ +201158~~~~360.242940~~~149.910199~~~11950.7 +201159~~~~444.953632~~~166.985655~~~11788.4 +201160~~~~364.136849~~~183.628767~~~11806.2 +201161~~~~413.836124~~~184.375703~~~11916.8 +201162~~~~502.953953~~~173.237159~~~12468.3 +""" + df = read_fwf(StringIO(data3), colspecs=colspecs, delimiter='~', header=None) + assert_frame_equal(df, expected) + + self.assertRaises(ValueError, read_fwf, StringIO(data3), + colspecs=colspecs, widths=[6, 10, 10, 7]) + def test_na_value_dict(self): + data = """A,B,C +foo,bar,NA +bar,foo,foo +foo,bar,NA +bar,foo,foo""" + + df = read_csv(StringIO(data), + na_values={'A': ['foo'], 'B': ['bar']}) + expected = DataFrame({'A': [np.nan, 'bar', np.nan, 'bar'], + 'B': [np.nan, 'foo', np.nan, 'foo'], + 'C': [np.nan, 'foo', np.nan, 'foo']}) + assert_frame_equal(df, expected) + + @slow + @network + def test_url(self): + # HTTP(S) + url = 'https://raw.github.com/pydata/pandas/master/pandas/io/tests/salary.table' + url_table = read_table(url) + dirpath = curpath() + localtable = os.path.join(dirpath, 'salary.table') + local_table = read_table(localtable) + assert_frame_equal(url_table, local_table) + #TODO: ftp testing + + @slow + def test_file(self): + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + dirpath = curpath() + localtable = os.path.join(dirpath, 'salary.table') + local_table = read_table(localtable) + + url_table = read_table('file://localhost/'+localtable) + assert_frame_equal(url_table, local_table) + + +class TestParseSQL(unittest.TestCase): + + def test_convert_sql_column_floats(self): + arr = np.array([1.5, None, 3, 4.2], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_strings(self): + arr = np.array(['1.5', None, '3', '4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array(['1.5', np.nan, '3', '4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_unicode(self): + arr = np.array([u'1.5', None, u'3', u'4.2'], dtype=object) + result = lib.convert_sql_column(arr) + expected = np.array([u'1.5', np.nan, u'3', u'4.2'], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_ints(self): + arr = np.array([1, 2, 3, 4], dtype='O') + arr2 = np.array([1, 2, 3, 4], dtype='i4').astype('O') + result = lib.convert_sql_column(arr) + result2 = lib.convert_sql_column(arr2) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + assert_same_values_and_dtype(result2, expected) + + arr = np.array([1, 2, 3, None, 4], dtype='O') + result = 
lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_longs(self): + arr = np.array([1L, 2L, 3L, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, 4], dtype='i8') + assert_same_values_and_dtype(result, expected) + + arr = np.array([1L, 2L, 3L, None, 4L], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([1, 2, 3, np.nan, 4], dtype='f8') + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_bools(self): + arr = np.array([True, False, True, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, True, False], dtype=bool) + assert_same_values_and_dtype(result, expected) + + arr = np.array([True, False, None, False], dtype='O') + result = lib.convert_sql_column(arr) + expected = np.array([True, False, np.nan, False], dtype=object) + assert_same_values_and_dtype(result, expected) + + def test_convert_sql_column_decimals(self): + from decimal import Decimal + arr = np.array([Decimal('1.5'), None, Decimal('3'), Decimal('4.2')]) + result = lib.convert_sql_column(arr) + expected = np.array([1.5, np.nan, 3, 4.2], dtype='f8') + assert_same_values_and_dtype(result, expected) + +def assert_same_values_and_dtype(res, exp): + assert(res.dtype == exp.dtype) + assert_almost_equal(res, exp) + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py new file mode 100644 index 00000000..87a6329e --- /dev/null +++ b/pandas/io/tests/test_pytables.py @@ -0,0 +1,685 @@ +from __future__ import with_statement + +import nose +import unittest +import os +import sys + +from datetime import datetime +import numpy as np + +from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, + date_range, Index) +from pandas.io.pytables import HDFStore, get_store +import pandas.util.testing as tm +from pandas.tests.test_series import assert_series_equal +from pandas.tests.test_frame import assert_frame_equal + +try: + import tables +except ImportError: + raise nose.SkipTest('no pytables') + +from distutils.version import LooseVersion + +_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ + and 'blosc' or 'zlib' + +class TestHDFStore(unittest.TestCase): + path = '__test__.h5' + scratchpath = '__scratch__.h5' + + def setUp(self): + self.store = HDFStore(self.path) + + def tearDown(self): + self.store.close() + os.remove(self.path) + + def test_factory_fun(self): + try: + with get_store(self.scratchpath) as tbl: + raise ValueError('blah') + except ValueError: + pass + + with get_store(self.scratchpath) as tbl: + tbl['a'] = tm.makeDataFrame() + + with get_store(self.scratchpath) as tbl: + self.assertEquals(len(tbl), 1) + self.assertEquals(type(tbl['a']), DataFrame) + + os.remove(self.scratchpath) + + def test_len_keys(self): + self.store['a'] = tm.makeTimeSeries() + self.store['b'] = tm.makeStringSeries() + self.store['c'] = tm.makeDataFrame() + self.store['d'] = tm.makePanel() + self.assertEquals(len(self.store), 4) + self.assert_(set(self.store.keys()) == set(['a', 'b', 'c', 'd'])) + + def test_repr(self): + repr(self.store) + self.store['a'] = tm.makeTimeSeries() + self.store['b'] = tm.makeStringSeries() + self.store['c'] = tm.makeDataFrame() + 
self.store['d'] = tm.makePanel() + repr(self.store) + + def test_contains(self): + self.store['a'] = tm.makeTimeSeries() + self.store['b'] = tm.makeDataFrame() + self.assert_('a' in self.store) + self.assert_('b' in self.store) + self.assert_('c' not in self.store) + + def test_reopen_handle(self): + self.store['a'] = tm.makeTimeSeries() + self.store.open('w', warn=False) + self.assert_(self.store.handle.isopen) + self.assertEquals(len(self.store), 0) + + def test_flush(self): + self.store['a'] = tm.makeTimeSeries() + self.store.flush() + + def test_get(self): + self.store['a'] = tm.makeTimeSeries() + left = self.store.get('a') + right = self.store['a'] + tm.assert_series_equal(left, right) + + self.assertRaises(AttributeError, self.store.get, 'b') + + def test_put(self): + ts = tm.makeTimeSeries() + df = tm.makeTimeDataFrame() + self.store['a'] = ts + self.store['b'] = df[:10] + self.store.put('c', df[:10], table=True) + + # not OK, not a table + self.assertRaises(ValueError, self.store.put, 'b', df[10:], append=True) + + # node does not currently exist, test _is_table_type returns False in + # this case + self.assertRaises(ValueError, self.store.put, 'f', df[10:], append=True) + + # OK + self.store.put('c', df[10:], append=True) + + # overwrite table + self.store.put('c', df[:10], table=True, append=False) + tm.assert_frame_equal(df[:10], self.store['c']) + + def test_put_compression(self): + df = tm.makeTimeDataFrame() + + self.store.put('c', df, table=True, compression='zlib') + tm.assert_frame_equal(self.store['c'], df) + + # can't compress if table=False + self.assertRaises(ValueError, self.store.put, 'b', df, + table=False, compression='zlib') + + def test_put_compression_blosc(self): + tm.skip_if_no_package('tables', '2.2', app='blosc support') + df = tm.makeTimeDataFrame() + + # can't compress if table=False + self.assertRaises(ValueError, self.store.put, 'b', df, + table=False, compression='blosc') + + self.store.put('c', df, table=True, compression='blosc') + tm.assert_frame_equal(self.store['c'], df) + + def test_put_integer(self): + # non-date, non-string index + df = DataFrame(np.random.randn(50, 100)) + self._check_roundtrip(df, tm.assert_frame_equal) + + def test_append(self): + df = tm.makeTimeDataFrame() + self.store.put('c', df[:10], table=True) + self.store.append('c', df[10:]) + tm.assert_frame_equal(self.store['c'], df) + + def test_append_diff_item_order(self): + wp = tm.makePanel() + wp1 = wp.ix[:, :10, :] + wp2 = wp.ix[['ItemC', 'ItemB', 'ItemA'], 10:, :] + + self.store.put('panel', wp1, table=True) + self.assertRaises(Exception, self.store.put, 'panel', wp2, + append=True) + + def test_remove(self): + ts = tm.makeTimeSeries() + df = tm.makeDataFrame() + self.store['a'] = ts + self.store['b'] = df + self.store.remove('a') + self.assertEquals(len(self.store), 1) + tm.assert_frame_equal(df, self.store['b']) + + self.store.remove('b') + self.assertEquals(len(self.store), 0) + + def test_remove_where_not_exist(self): + crit1 = { + 'field' : 'index', + 'op' : '>', + 'value' : 'foo' + } + self.store.remove('a', where=[crit1]) + + def test_remove_crit(self): + wp = tm.makePanel() + self.store.put('wp', wp, table=True) + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = { + 'field' : 'index', + 'op' : '>', + 'value' : date + } + crit2 = { + 'field' : 'column', + 'value' : ['A', 'D'] + } + self.store.remove('wp', where=[crit1]) + self.store.remove('wp', where=[crit2]) + result = self.store['wp'] + expected = wp.truncate(after=date).reindex(minor=['B', 'C']) + 
tm.assert_panel_equal(result, expected) + + def test_series(self): + s = tm.makeStringSeries() + self._check_roundtrip(s, tm.assert_series_equal) + + ts = tm.makeTimeSeries() + self._check_roundtrip(ts, tm.assert_series_equal) + + ts2 = Series(ts.index, Index(ts.index, dtype=object)) + self._check_roundtrip(ts2, tm.assert_series_equal) + + ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), + dtype=object)) + self._check_roundtrip(ts3, tm.assert_series_equal) + + def test_sparse_series(self): + s = tm.makeStringSeries() + s[3:5] = np.nan + ss = s.to_sparse() + self._check_roundtrip(ss, tm.assert_series_equal, + check_series_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_roundtrip(ss2, tm.assert_series_equal, + check_series_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_roundtrip(ss3, tm.assert_series_equal, + check_series_type=True) + + def test_sparse_frame(self): + s = tm.makeDataFrame() + s.ix[3:5, 1:3] = np.nan + s.ix[8:10, -2] = np.nan + ss = s.to_sparse() + self._check_double_roundtrip(ss, tm.assert_frame_equal, + check_frame_type=True) + + ss2 = s.to_sparse(kind='integer') + self._check_double_roundtrip(ss2, tm.assert_frame_equal, + check_frame_type=True) + + ss3 = s.to_sparse(fill_value=0) + self._check_double_roundtrip(ss3, tm.assert_frame_equal, + check_frame_type=True) + + def test_sparse_panel(self): + items = ['x', 'y', 'z'] + p = Panel(dict((i, tm.makeDataFrame()) for i in items)) + sp = p.to_sparse() + + self._check_double_roundtrip(sp, tm.assert_panel_equal, + check_panel_type=True) + + sp2 = p.to_sparse(kind='integer') + self._check_double_roundtrip(sp2, tm.assert_panel_equal, + check_panel_type=True) + + sp3 = p.to_sparse(fill_value=0) + self._check_double_roundtrip(sp3, tm.assert_panel_equal, + check_panel_type=True) + + def test_float_index(self): + # GH #454 + index = np.random.randn(10) + s = Series(np.random.randn(10), index=index) + self._check_roundtrip(s, tm.assert_series_equal) + + def test_tuple_index(self): + # GH #492 + col = np.arange(10) + idx = [(0.,1.), (2., 3.), (4., 5.)] + data = np.random.randn(30).reshape((3, 10)) + DF = DataFrame(data, index=idx, columns=col) + self._check_roundtrip(DF, tm.assert_frame_equal) + + def test_index_types(self): + values = np.random.randn(2) + + func = lambda l, r : tm.assert_series_equal(l, r, True, True, True) + + ser = Series(values, [0, 'y']) + self._check_roundtrip(ser, func) + + ser = Series(values, [datetime.today(), 0]) + self._check_roundtrip(ser, func) + + ser = Series(values, ['y', 0]) + self._check_roundtrip(ser, func) + + from datetime import date + ser = Series(values, [date.today(), 'a']) + self._check_roundtrip(ser, func) + + ser = Series(values, [1.23, 'b']) + self._check_roundtrip(ser, func) + + ser = Series(values, [1, 1.53]) + self._check_roundtrip(ser, func) + + ser = Series(values, [1, 5]) + self._check_roundtrip(ser, func) + + ser = Series(values, [datetime(2012, 1, 1), datetime(2012, 1, 2)]) + self._check_roundtrip(ser, func) + + def test_timeseries_preepoch(self): + if sys.version_info[0] == 2 and sys.version_info[1] < 7: + raise nose.SkipTest + + dr = bdate_range('1/1/1940', '1/1/1960') + ts = Series(np.random.randn(len(dr)), index=dr) + try: + self._check_roundtrip(ts, tm.assert_series_equal) + except OverflowError: + raise nose.SkipTest('known failer on some windows platforms') + + def test_frame(self): + df = tm.makeDataFrame() + + # put in some random NAs + df.values[0, 0] = np.nan + df.values[5, 3] = np.nan + + self._check_roundtrip_table(df, 
tm.assert_frame_equal) + self._check_roundtrip(df, tm.assert_frame_equal) + + self._check_roundtrip_table(df, tm.assert_frame_equal, + compression=True) + self._check_roundtrip(df, tm.assert_frame_equal, + compression=True) + + tdf = tm.makeTimeDataFrame() + self._check_roundtrip(tdf, tm.assert_frame_equal) + self._check_roundtrip(tdf, tm.assert_frame_equal, + compression=True) + + # not consolidated + df['foo'] = np.random.randn(len(df)) + self.store['df'] = df + recons = self.store['df'] + self.assert_(recons._data.is_consolidated()) + + # empty + self.assertRaises(ValueError, self._check_roundtrip, df[:0], + tm.assert_frame_equal) + + def test_can_serialize_dates(self): + rng = [x.date() for x in bdate_range('1/1/2000', '1/30/2000')] + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + self._check_roundtrip(frame, tm.assert_frame_equal) + + def test_timezones(self): + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + frame = DataFrame(np.random.randn(len(rng), 4), index=rng) + try: + store = HDFStore(self.scratchpath) + store['frame'] = frame + recons = store['frame'] + self.assert_(recons.index.equals(rng)) + self.assertEquals(rng.tz, recons.index.tz) + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_hierarchical(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self._check_roundtrip(frame, tm.assert_frame_equal) + self._check_roundtrip(frame.T, tm.assert_frame_equal) + self._check_roundtrip(frame['A'], tm.assert_series_equal) + + # check that the names are stored + try: + store = HDFStore(self.scratchpath) + store['frame'] = frame + recons = store['frame'] + assert(recons.index.names == ['foo', 'bar']) + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_index_name(self): + df = tm.makeDataFrame() + df.index.name = 'foo' + try: + store = HDFStore(self.scratchpath) + store['frame'] = df + recons = store['frame'] + assert(recons.index.name == 'foo') + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_series_name(self): + df = tm.makeDataFrame() + series = df['A'] + + try: + store = HDFStore(self.scratchpath) + store['series'] = series + recons = store['series'] + assert(recons.name == 'A') + finally: + store.close() + os.remove(self.scratchpath) + + def test_store_mixed(self): + def _make_one(): + df = tm.makeDataFrame() + df['obj1'] = 'foo' + df['obj2'] = 'bar' + df['bool1'] = df['A'] > 0 + df['bool2'] = df['B'] > 0 + df['int1'] = 1 + df['int2'] = 2 + return df.consolidate() + + df1 = _make_one() + df2 = _make_one() + + self._check_roundtrip(df1, tm.assert_frame_equal) + self._check_roundtrip(df2, tm.assert_frame_equal) + + self.store['obj'] = df1 + tm.assert_frame_equal(self.store['obj'], df1) + self.store['obj'] = df2 + tm.assert_frame_equal(self.store['obj'], df2) + + # storing in Table not yet supported + self.assertRaises(Exception, self.store.put, 'foo', + df1, table=True) + + # check that can store Series of all of these types + self._check_roundtrip(df1['obj1'], tm.assert_series_equal) + self._check_roundtrip(df1['bool1'], tm.assert_series_equal) + self._check_roundtrip(df1['int1'], tm.assert_series_equal) + + # try with compression + self._check_roundtrip(df1['obj1'], tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1['bool1'], 
tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1['int1'], tm.assert_series_equal, + compression=True) + self._check_roundtrip(df1, tm.assert_frame_equal, + compression=True) + + def test_wide(self): + wp = tm.makePanel() + self._check_roundtrip(wp, tm.assert_panel_equal) + + def test_wide_table(self): + wp = tm.makePanel() + self._check_roundtrip_table(wp, tm.assert_panel_equal) + + def test_wide_table_dups(self): + wp = tm.makePanel() + try: + store = HDFStore(self.scratchpath) + store._quiet = True + store.put('panel', wp, table=True) + store.put('panel', wp, table=True, append=True) + recons = store['panel'] + tm.assert_panel_equal(recons, wp) + finally: + store.close() + os.remove(self.scratchpath) + + def test_long(self): + def _check(left, right): + tm.assert_panel_equal(left.to_panel(), right.to_panel()) + + wp = tm.makePanel() + self._check_roundtrip(wp.to_frame(), _check) + + # empty + self.assertRaises(ValueError, self._check_roundtrip, wp.to_frame()[:0], + _check) + + def test_longpanel(self): + pass + + def test_overwrite_node(self): + self.store['a'] = tm.makeTimeDataFrame() + ts = tm.makeTimeSeries() + self.store['a'] = ts + + tm.assert_series_equal(self.store['a'], ts) + + def test_panel_select(self): + wp = tm.makePanel() + self.store.put('wp', wp, table=True) + date = wp.major_axis[len(wp.major_axis) // 2] + + crit1 = { + 'field' : 'index', + 'op' : '>=', + 'value' : date + } + crit2 = { + 'field' : 'column', + 'value' : ['A', 'D'] + } + + result = self.store.select('wp', [crit1, crit2]) + expected = wp.truncate(before=date).reindex(minor=['A', 'D']) + tm.assert_panel_equal(result, expected) + + def test_frame_select(self): + df = tm.makeTimeDataFrame() + self.store.put('frame', df, table=True) + date = df.index[len(df) // 2] + + crit1 = { + 'field' : 'index', + 'op' : '>=', + 'value' : date + } + crit2 = { + 'field' : 'column', + 'value' : ['A', 'D'] + } + crit3 = { + 'field' : 'column', + 'value' : 'A' + } + + result = self.store.select('frame', [crit1, crit2]) + expected = df.ix[date:, ['A', 'D']] + tm.assert_frame_equal(result, expected) + + result = self.store.select('frame', [crit3]) + expected = df.ix[:, ['A']] + tm.assert_frame_equal(result, expected) + + # can't select if not written as table + self.store['frame'] = df + self.assertRaises(Exception, self.store.select, + 'frame', [crit1, crit2]) + + def test_select_filter_corner(self): + df = DataFrame(np.random.randn(50, 100)) + df.index = ['%.3d' % c for c in df.index] + df.columns = ['%.3d' % c for c in df.columns] + self.store.put('frame', df, table=True) + + crit = { + 'field' : 'column', + 'value' : df.columns[:75] + } + result = self.store.select('frame', [crit]) + tm.assert_frame_equal(result, df.ix[:, df.columns[:75]]) + + def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): + options = {} + if compression: + options['complib'] = _default_compressor + + store = HDFStore(self.scratchpath, 'w', **options) + try: + store['obj'] = obj + retrieved = store['obj'] + comparator(retrieved, obj, **kwargs) + finally: + store.close() + os.remove(self.scratchpath) + + def _check_double_roundtrip(self, obj, comparator, compression=False, + **kwargs): + options = {} + if compression: + options['complib'] = _default_compressor + + store = HDFStore(self.scratchpath, 'w', **options) + try: + store['obj'] = obj + retrieved = store['obj'] + comparator(retrieved, obj, **kwargs) + store['obj'] = retrieved + again = store['obj'] + comparator(again, obj, **kwargs) + finally: + 
store.close() + os.remove(self.scratchpath) + + def _check_roundtrip_table(self, obj, comparator, compression=False): + options = {} + if compression: + options['complib'] = _default_compressor + + store = HDFStore(self.scratchpath, 'w', **options) + try: + store.put('obj', obj, table=True) + retrieved = store['obj'] + sorted_obj = _test_sort(obj) + comparator(retrieved, sorted_obj) + finally: + store.close() + os.remove(self.scratchpath) + + def test_legacy_read(self): + pth = curpath() + store = HDFStore(os.path.join(pth, 'legacy.h5'), 'r') + store['a'] + store['b'] + store['c'] + store['d'] + store.close() + + def test_store_datetime_fractional_secs(self): + dt = datetime(2012, 1, 2, 3, 4, 5, 123456) + series = Series([0], [dt]) + self.store['a'] = series + self.assertEquals(self.store['a'].index[0], dt) + + def test_tseries_indices_series(self): + idx = tm.makeDateIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + self.store['a'] = ser + result = self.store['a'] + + assert_series_equal(result, ser) + self.assertEquals(type(result.index), type(ser.index)) + self.assertEquals(result.index.freq, ser.index.freq) + + idx = tm.makePeriodIndex(10) + ser = Series(np.random.randn(len(idx)), idx) + self.store['a'] = ser + result = self.store['a'] + + assert_series_equal(result, ser) + self.assertEquals(type(result.index), type(ser.index)) + self.assertEquals(result.index.freq, ser.index.freq) + + def test_tseries_indices_frame(self): + idx = tm.makeDateIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + self.store['a'] = df + result = self.store['a'] + + assert_frame_equal(result, df) + self.assertEquals(type(result.index), type(df.index)) + self.assertEquals(result.index.freq, df.index.freq) + + idx = tm.makePeriodIndex(10) + df = DataFrame(np.random.randn(len(idx), 3), idx) + self.store['a'] = df + result = self.store['a'] + + assert_frame_equal(result, df) + self.assertEquals(type(result.index), type(df.index)) + self.assertEquals(result.index.freq, df.index.freq) + + def test_unicode_index(self): + unicode_values = [u'\u03c3', u'\u03c3\u03c3'] + + s = Series(np.random.randn(len(unicode_values)), unicode_values) + self._check_roundtrip(s, tm.assert_series_equal) + + def test_store_datetime_mixed(self): + df = DataFrame({'a': [1,2,3], 'b': [1.,2.,3.], 'c': ['a', 'b', 'c']}) + ts = tm.makeTimeSeries() + df['d'] = ts.index[:3] + self._check_roundtrip(df, tm.assert_frame_equal) + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + +def _test_sort(obj): + if isinstance(obj, DataFrame): + return obj.reindex(sorted(obj.index)) + elif isinstance(obj, Panel): + return obj.reindex(major=sorted(obj.major_axis)) + else: + raise ValueError('type not supported here') + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py new file mode 100644 index 00000000..0767bdcd --- /dev/null +++ b/pandas/io/tests/test_sql.py @@ -0,0 +1,169 @@ +from pandas.util.py3compat import StringIO +import unittest +import sqlite3 +import sys + +import numpy as np + +import pandas.io.sql as sql +import pandas.util.testing as tm +from pandas import Series, Index + +class TestSQLite(unittest.TestCase): + + def setUp(self): + self.db = sqlite3.connect(':memory:') + + def test_basic(self): + frame = tm.makeTimeDataFrame() + self._check_roundtrip(frame) + + def test_write_row_by_row(self): + frame = tm.makeTimeDataFrame() + frame.ix[0, 
0] = np.nan + create_sql = sql.get_sqlite_schema(frame, 'test') + self.db.execute(create_sql) + + cur = self.db.cursor() + + ins = "INSERT INTO test VALUES (%s, %s, %s, %s)" + for idx, row in frame.iterrows(): + fmt_sql = sql.format_query(ins, *row) + sql.tquery(fmt_sql, cur=cur) + + self.db.commit() + + result = sql.read_frame("select * from test", con=self.db) + result.index = frame.index + tm.assert_frame_equal(result, frame) + + def test_execute(self): + frame = tm.makeTimeDataFrame() + create_sql = sql.get_sqlite_schema(frame, 'test') + self.db.execute(create_sql) + ins = "INSERT INTO test VALUES (?, ?, ?, ?)" + + row = frame.ix[0] + sql.execute(ins, self.db, params=tuple(row)) + self.db.commit() + + result = sql.read_frame("select * from test", self.db) + result.index = frame.index[:1] + tm.assert_frame_equal(result, frame[:1]) + + + + def test_execute_fail(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + self.db.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)', self.db) + + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.execute, + 'INSERT INTO test VALUES("foo", "bar", 7)', + self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_execute_closed_connection(self): + create_sql = """ + CREATE TABLE test + ( + a TEXT, + b TEXT, + c REAL, + PRIMARY KEY (a, b) + ); + """ + self.db.execute(create_sql) + + sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)', self.db) + self.db.close() + try: + sys.stdout = StringIO() + self.assertRaises(Exception, sql.tquery, "select * from test", + con=self.db) + finally: + sys.stdout = sys.__stdout__ + + def test_na_roundtrip(self): + pass + + def _check_roundtrip(self, frame): + sql.write_frame(frame, name='test_table', con=self.db) + result = sql.read_frame("select * from test_table", self.db) + + # HACK! 
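+        # restore the frame's index on the result before comparing, since the
+        # round-trip through sqlite does not keep the original index.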
+ result.index = frame.index + + expected = frame + tm.assert_frame_equal(result, expected) + + frame['txt'] = ['a'] * len(frame) + frame2 = frame.copy() + frame2['Idx'] = Index(range(len(frame2))) + 10 + sql.write_frame(frame2, name='test_table2', con=self.db) + result = sql.read_frame("select * from test_table2", self.db, + index_col='Idx') + expected = frame.copy() + expected.index = Index(range(len(frame2))) + 10 + tm.assert_frame_equal(expected, result) + + + + def test_tquery(self): + frame = tm.makeTimeDataFrame() + sql.write_frame(frame, name='test_table', con=self.db) + result = sql.tquery("select A from test_table", self.db) + expected = frame.A + result = Series(result, frame.index) + tm.assert_series_equal(result, expected) + + try: + sys.stdout = StringIO() + self.assertRaises(sqlite3.OperationalError, sql.tquery, + 'select * from blah', con=self.db) + + self.assertRaises(sqlite3.OperationalError, sql.tquery, + 'select * from blah', con=self.db, retry=True) + finally: + sys.stdout = sys.__stdout__ + + def test_uquery(self): + frame = tm.makeTimeDataFrame() + sql.write_frame(frame, name='test_table', con=self.db) + stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' + self.assertEqual(sql.uquery(stmt, con=self.db), 1) + + try: + sys.stdout = StringIO() + + self.assertRaises(sqlite3.OperationalError, sql.tquery, + 'insert into blah values (1)', con=self.db) + + self.assertRaises(sqlite3.OperationalError, sql.tquery, + 'insert into blah values (1)', con=self.db, + retry=True) + finally: + sys.stdout = sys.__stdout__ + + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--pdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/io/tests/test_yahoo.py b/pandas/io/tests/test_yahoo.py new file mode 100644 index 00000000..9f123e01 --- /dev/null +++ b/pandas/io/tests/test_yahoo.py @@ -0,0 +1,30 @@ +from pandas.util.py3compat import StringIO, BytesIO +from datetime import datetime +import csv +import os +import sys +import re +import unittest +import pandas.io.data as pd +import nose + +class TestYahoo(unittest.TestCase): + + def test_yahoo(self): + """asserts that yahoo is minimally working and that it throws + an excecption when DataReader can't get a 200 response from + yahoo """ + start = datetime(2010,1,1) + end = datetime(2012,1,24) + self.assertEquals( + pd.DataReader("F", 'yahoo', start, end)['Close'][-1], + 12.82) + + self.assertRaises( + Exception, + lambda: pd.DataReader("NON EXISTENT TICKER", 'yahoo', start, end)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/rpy/__init__.py b/pandas/rpy/__init__.py new file mode 100644 index 00000000..8d89a1bf --- /dev/null +++ b/pandas/rpy/__init__.py @@ -0,0 +1 @@ +# from pandas.rpy.common import importr, r, load_data diff --git a/pandas/rpy/base.py b/pandas/rpy/base.py new file mode 100644 index 00000000..070d457e --- /dev/null +++ b/pandas/rpy/base.py @@ -0,0 +1,13 @@ +import pandas.rpy.util as util + +class lm(object): + """ + Examples + -------- + >>> model = lm('x ~ y + z', data) + >>> model.coef + """ + def __init__(self, formula, data): + pass + + diff --git a/pandas/rpy/common.py b/pandas/rpy/common.py new file mode 100644 index 00000000..8667fb2f --- /dev/null +++ b/pandas/rpy/common.py @@ -0,0 +1,375 @@ +""" +Utilities for making working with rpy2 more user- and 
+developer-friendly. +""" + +import numpy as np + +import pandas as pn +import pandas.util.testing as _test + +from rpy2.robjects.packages import importr +from rpy2.robjects import r +import rpy2.robjects as robj + +__all__ = ['convert_robj', 'load_data', 'convert_to_r_dataframe', + 'convert_to_r_matrix'] + +def load_data(name, package=None, convert=True): + if package: + pack = importr(package) + + r.data(name) + + robj = r[name] + + if convert: + return convert_robj(robj) + else: + return robj + +def _rclass(obj): + """ + Return R class name for input object + """ + return r['class'](obj)[0] + +def _is_null(obj): + return _rclass(obj) == 'NULL' + +def _convert_list(obj): + """ + Convert named Vector to dict + """ + values = [convert_robj(x) for x in obj] + return dict(zip(obj.names, values)) + +def _convert_array(obj): + """ + Convert Array to ndarray + """ + # this royally sucks. "Matrices" (arrays) with dimension > 3 in R aren't + # really matrices-- things come out Fortran order in the first two + # dimensions. Maybe I'm wrong? + + dim = list(obj.dim) + values = np.array(list(obj)) + + if len(dim) == 3: + arr = values.reshape(dim[-1:] + dim[:-1]).swapaxes(1, 2) + + + if obj.names is not None: + name_list = [list(x) for x in obj.names] + if len(dim) == 2: + return pn.DataFrame(arr, index=name_list[0], columns=name_list[1]) + elif len(dim) == 3: + return pn.Panel(arr, items=name_list[2], + major_axis=name_list[0], + minor_axis=name_list[1]) + else: + print 'Cannot handle dim=%d' % len(dim) + else: + return arr + +def _convert_vector(obj): + if isinstance(obj, robj.IntVector): + return _convert_int_vector(obj) + elif isinstance(obj, robj.StrVector): + return _convert_str_vector(obj) + + return list(obj) + +NA_INTEGER = -2147483648 + +def _convert_int_vector(obj): + arr = np.asarray(obj) + mask = arr == NA_INTEGER + if mask.any(): + arr = arr.astype(float) + arr[mask] = np.nan + return arr + +def _convert_str_vector(obj): + arr = np.asarray(obj, dtype=object) + mask = arr == robj.NA_Character + if mask.any(): + arr[mask] = np.nan + return arr + +def _convert_DataFrame(rdf): + columns = list(rdf.colnames) + rows = np.array(rdf.rownames) + + data = {} + for i, col in enumerate(columns): + vec = rdf.rx2(i + 1) + values = _convert_vector(vec) + + if isinstance(vec, robj.FactorVector): + values = np.asarray(vec.levels).take(values - 1) + + data[col] = values + + return pn.DataFrame(data, index=_check_int(rows), columns=columns) + +def _convert_Matrix(mat): + columns = mat.colnames + rows = mat.rownames + + columns = None if _is_null(columns) else list(columns) + index = None if _is_null(rows) else list(rows) + + return pn.DataFrame(np.array(mat), index=_check_int(index), + columns=columns) + +def _check_int(vec): + try: + # R observation numbers come through as strings + vec = vec.astype(int) + except Exception: + pass + + return vec + +_pandas_converters = [ + (robj.DataFrame , _convert_DataFrame), + (robj.Matrix , _convert_Matrix), + (robj.StrVector, _convert_vector), + (robj.FloatVector, _convert_vector), + (robj.Array, _convert_array), + (robj.Vector, _convert_list), +] + +_converters = [ + (robj.DataFrame , lambda x: _convert_DataFrame(x).toRecords(index=False)), + (robj.Matrix , lambda x: _convert_Matrix(x).toRecords(index=False)), + (robj.IntVector, _convert_vector), + (robj.StrVector, _convert_vector), + (robj.FloatVector, _convert_vector), + (robj.Array, _convert_array), + (robj.Vector, _convert_list), +] + +def convert_robj(obj, use_pandas=True): + """ + Convert rpy2 object to a 
pandas-friendly form + + Parameters + ---------- + obj : rpy2 object + + Returns + ------- + Non-rpy data structure, mix of NumPy and pandas objects + """ + if not isinstance(obj, robj.RObjectMixin): + return obj + + converters = _pandas_converters if use_pandas else _converters + + for rpy_type, converter in converters: + if isinstance(obj, rpy_type): + return converter(obj) + + raise Exception('Do not know what to do with %s object' % type(obj)) + +VECTOR_TYPES = {np.float64: robj.FloatVector, + np.float32: robj.FloatVector, + np.float: robj.FloatVector, + np.int: robj.IntVector, + np.int32: robj.IntVector, + np.int64: robj.IntVector, + np.object_: robj.StrVector, + np.str: robj.StrVector, + np.bool: robj.BoolVector} + +NA_TYPES = {np.float64: robj.NA_Real, + np.float32: robj.NA_Real, + np.float: robj.NA_Real, + np.int: robj.NA_Integer, + np.int32: robj.NA_Integer, + np.int64: robj.NA_Integer, + np.object_: robj.NA_Character, + np.str: robj.NA_Character, + np.bool: robj.NA_Logical} + +def convert_to_r_dataframe(df, strings_as_factors=False): + """ + Convert a pandas DataFrame to a R data.frame. + + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R data.frame + + """ + + import rpy2.rlike.container as rlc + + columns = rlc.OrdDict() + + #FIXME: This doesn't handle MultiIndex + + for column in df: + value = df[column] + value_type = value.dtype.type + value = [item if pn.notnull(item) else NA_TYPES[value_type] + for item in value] + + value = VECTOR_TYPES[value_type](value) + + if not strings_as_factors: + I = robj.baseenv.get("I") + value = I(value) + + columns[column] = value + + r_dataframe = robj.DataFrame(columns) + + del columns + + r_dataframe.rownames = robj.StrVector(df.index) + + return r_dataframe + + +def convert_to_r_matrix(df, strings_as_factors=False): + + """ + Convert a pandas DataFrame to a R matrix. 
+ + Parameters + ---------- + df: The DataFrame being converted + strings_as_factors: Whether to turn strings into R factors (default: False) + + Returns + ------- + A R matrix + + """ + + if df._is_mixed_type: + raise TypeError("Conversion to matrix only possible with non-mixed " + "type DataFrames") + + + r_dataframe = convert_to_r_dataframe(df, strings_as_factors) + as_matrix = robj.baseenv.get("as.matrix") + r_matrix = as_matrix(r_dataframe) + + return r_matrix + + +def test_convert_list(): + obj = r('list(a=1, b=2, c=3)') + + converted = convert_robj(obj) + expected = {'a' : [1], 'b' : [2], 'c' : [3]} + + _test.assert_dict_equal(converted, expected) + +def test_convert_nested_list(): + obj = r('list(a=list(foo=1, bar=2))') + + converted = convert_robj(obj) + expected = {'a' : {'foo' : [1], 'bar' : [2]}} + + _test.assert_dict_equal(converted, expected) + +def test_convert_frame(): + # built-in dataset + df = r['faithful'] + + converted = convert_robj(df) + + assert np.array_equal(converted.columns, ['eruptions', 'waiting']) + assert np.array_equal(converted.index, np.arange(1, 273)) + +def _test_matrix(): + r('mat <- matrix(rnorm(9), ncol=3)') + r('colnames(mat) <- c("one", "two", "three")') + r('rownames(mat) <- c("a", "b", "c")') + + return r['mat'] + +def test_convert_matrix(): + mat = _test_matrix() + + converted = convert_robj(mat) + + assert np.array_equal(converted.index, ['a', 'b', 'c']) + assert np.array_equal(converted.columns, ['one', 'two', 'three']) + +def test_convert_r_dataframe(): + + is_na = robj.baseenv.get("is.na") + + seriesd = _test.getSeriesData() + frame = pn.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + + #Null data + frame["E"] = [np.nan for item in frame["A"]] + # Some mixed type data + frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)] + + r_dataframe = convert_to_r_dataframe(frame) + + assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) + assert all(is_na(item) for item in r_dataframe.rx2("E")) + + for column in frame[["A", "B", "C", "D"]]: + coldata = r_dataframe.rx2(column) + original_data = frame[column] + assert np.array_equal(convert_robj(coldata), original_data) + + for column in frame[["D", "E"]]: + for original, converted in zip(frame[column], + r_dataframe.rx2(column)): + + if pn.isnull(original): + assert is_na(converted) + else: + assert original == converted + +def test_convert_r_matrix(): + + is_na = robj.baseenv.get("is.na") + + seriesd = _test.getSeriesData() + frame = pn.DataFrame(seriesd, columns=['D', 'C', 'B', 'A']) + #Null data + frame["E"] = [np.nan for item in frame["A"]] + + r_dataframe = convert_to_r_matrix(frame) + + assert np.array_equal(convert_robj(r_dataframe.rownames), frame.index) + assert np.array_equal(convert_robj(r_dataframe.colnames), frame.columns) + assert all(is_na(item) for item in r_dataframe.rx(True, "E")) + + for column in frame[["A", "B", "C", "D"]]: + coldata = r_dataframe.rx(True, column) + original_data = frame[column] + assert np.array_equal(convert_robj(coldata), + original_data) + + # Pandas bug 1282 + frame["F"] = ["text" if item % 2 == 0 else np.nan for item in range(30)] + + #FIXME: Ugly, this whole module needs to be ported to nose/unittest + try: + wrong_matrix = convert_to_r_matrix(frame) + except TypeError: + pass + except Exception: + raise + + +if __name__ == '__main__': + pass diff --git a/pandas/rpy/mass.py b/pandas/rpy/mass.py new file mode 100644 index 00000000..1a663e57 --- 
/dev/null +++ b/pandas/rpy/mass.py @@ -0,0 +1,4 @@ + +class rlm(object): + pass + diff --git a/pandas/rpy/vars.py b/pandas/rpy/vars.py new file mode 100644 index 00000000..3993423b --- /dev/null +++ b/pandas/rpy/vars.py @@ -0,0 +1,20 @@ +import pandas.rpy.util as util + +class VAR(object): + """ + + Parameters + ---------- + y : + p : + type : {"const", "trend", "both", "none"} + season : + exogen : + lag_max : + ic : {"AIC", "HQ", "SC", "FPE"} + Information criterion to use, if lag_max is not None + """ + def __init__(y, p=1, type="none", season=None, exogen=None, + lag_max=None, ic=None): + pass + diff --git a/pandas/sandbox/__init__.py b/pandas/sandbox/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sandbox/qtpandas.py b/pandas/sandbox/qtpandas.py new file mode 100644 index 00000000..50d183db --- /dev/null +++ b/pandas/sandbox/qtpandas.py @@ -0,0 +1,127 @@ +''' +Easy integration of DataFrame into pyqt framework + +@author: Jev Kuznetsov +''' +from PyQt4.QtCore import (QAbstractTableModel,Qt,QVariant,QModelIndex, SIGNAL) +from PyQt4.QtGui import (QApplication,QDialog,QVBoxLayout, QTableView, QWidget) + +from pandas import DataFrame, Index + + + +class DataFrameModel(QAbstractTableModel): + ''' data model for a DataFrame class ''' + def __init__(self): + super(DataFrameModel,self).__init__() + self.df = DataFrame() + + def setDataFrame(self,dataFrame): + self.df = dataFrame + + def signalUpdate(self): + ''' tell viewers to update their data (this is full update, not + efficient)''' + self.layoutChanged.emit() + + #------------- table display functions ----------------- + def headerData(self,section,orientation,role=Qt.DisplayRole): + if role != Qt.DisplayRole: + return QVariant() + + if orientation == Qt.Horizontal: + try: + return self.df.columns.tolist()[section] + except (IndexError, ): + return QVariant() + elif orientation == Qt.Vertical: + try: + #return self.df.index.tolist() + return self.df.index.tolist()[section] + except (IndexError, ): + return QVariant() + + def data(self, index, role=Qt.DisplayRole): + if role != Qt.DisplayRole: + return QVariant() + + if not index.isValid(): + return QVariant() + + return QVariant(str(self.df.ix[index.row(),index.column()])) + + def flags(self, index): + flags = super(DataFrameModel, self).flags(index) + flags |= Qt.ItemIsEditable + return flags + + def setData(self, index, value, role): + self.df.set_value(self.df.index[index.row()], + self.df.columns[index.column()], + value.toPyObject()) + return True + + def rowCount(self, index=QModelIndex()): + return self.df.shape[0] + + def columnCount(self, index=QModelIndex()): + return self.df.shape[1] + + +class DataFrameWidget(QWidget): + ''' a simple widget for using DataFrames in a gui ''' + def __init__(self,dataFrame, parent=None): + super(DataFrameWidget,self).__init__(parent) + + self.dataModel = DataFrameModel() + self.dataModel.setDataFrame(dataFrame) + + self.dataTable = QTableView() + self.dataTable.setModel(self.dataModel) + self.dataModel.signalUpdate() + + layout = QVBoxLayout() + layout.addWidget(self.dataTable) + self.setLayout(layout) + + + + def resizeColumnsToContents(self): + self.dataTable.resizeColumnsToContents() + +#-----------------stand alone test code + +def testDf(): + ''' creates test dataframe ''' + data = {'int':[1,2,3], 'float':[1.5,2.5,3.5], + 'string':['a','b','c'], 'nan':[np.nan,np.nan,np.nan]} + return DataFrame(data, index=Index(['AAA','BBB','CCC']), + columns=['int','float','string','nan']) + + +class Form(QDialog): + def 
__init__(self,parent=None): + super(Form,self).__init__(parent) + + df = testDf() # make up some data + widget = DataFrameWidget(df) + widget.resizeColumnsToContents() + + layout = QVBoxLayout() + layout.addWidget(widget) + self.setLayout(layout) + +if __name__=='__main__': + import sys + import numpy as np + + app = QApplication(sys.argv) + form = Form() + form.show() + app.exec_() + + + + + + diff --git a/pandas/sandbox/stats/__init__.py b/pandas/sandbox/stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sandbox/stats/rls.py b/pandas/sandbox/stats/rls.py new file mode 100644 index 00000000..b873225c --- /dev/null +++ b/pandas/sandbox/stats/rls.py @@ -0,0 +1,137 @@ +"""Restricted least squares""" + +import numpy as np +from scikits.statsmodels.regression import WLS, GLS, RegressionResults + +class RLS(GLS): + """ + Restricted general least squares model that handles linear constraints + + Parameters + ---------- + endog: array-like + n length array containing the dependent variable + exog: array-like + n-by-p array of independent variables + constr: array-like + k-by-p array of linear constraints + param (0.): array-like or scalar + p-by-1 array (or scalar) of constraint parameters + sigma (None): scalar or array-like + The weighting matrix of the covariance. No scaling by default (OLS). + If sigma is a scalar, then it is converted into an n-by-n diagonal + matrix with sigma as each diagonal element. + If sigma is an n-length array, then it is assumed to be a diagonal + matrix with the given sigma on the diagonal (WLS). + + Notes + ----- + endog = exog * beta + epsilon + weights' * constr * beta = param + + See Greene and Seaks, "The Restricted Least Squares Estimator: + A Pedagogical Note", The Review of Economics and Statistics, 1991. + """ + + def __init__(self, endog, exog, constr, param=0., sigma=None): + N, Q = exog.shape + if constr.ndim == 1: + K, P = 1, constr.shape[0] + else: + K, P = constr.shape + if Q != P: + raise Exception('Constraints and design do not align') + self.ncoeffs = Q + self.nconstraint = K + self.constraint = constr + if np.isscalar(param) and K > 1: + param = np.ones((K,)) * param + self.param = param + if sigma is None: + sigma = 1. 
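+        # descriptive note (not in the original source): a scalar sigma is
+        # broadcast to a length-N vector below and then treated as the
+        # diagonal of the error covariance (WLS-style), so the default
+        # sigma=1. reduces to ordinary least squares weighting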
+ if np.isscalar(sigma): + sigma = np.ones(N) * sigma + sigma = np.squeeze(sigma) + if sigma.ndim == 1: + self.sigma = np.diag(sigma) + self.cholsigmainv = np.diag(np.sqrt(sigma)) + else: + self.sigma = sigma + self.cholsigmainv = np.linalg.cholesky(np.linalg.pinv(self.sigma)).T + super(GLS, self).__init__(endog, exog) + + _rwexog = None + @property + def rwexog(self): + """Whitened exogenous variables augmented with restrictions""" + if self._rwexog is None: + P = self.ncoeffs + K = self.nconstraint + design = np.zeros((P + K, P + K)) + design[:P, :P] = np.dot(self.wexog.T, self.wexog) #top left + constr = np.reshape(self.constraint, (K, P)) + design[:P, P:] = constr.T #top right partition + design[P:, :P] = constr #bottom left partition + design[P:, P:] = np.zeros((K, K)) #bottom right partition + self._rwexog = design + return self._rwexog + + _inv_rwexog = None + @property + def inv_rwexog(self): + """Inverse of self.rwexog""" + if self._inv_rwexog is None: + self._inv_rwexog = np.linalg.inv(self.rwexog) + return self._inv_rwexog + + _rwendog = None + @property + def rwendog(self): + """Whitened endogenous variable augmented with restriction parameters""" + if self._rwendog is None: + P = self.ncoeffs + K = self.nconstraint + response = np.zeros((P + K,)) + response[:P] = np.dot(self.wexog.T, self.wendog) + response[P:] = self.param + self._rwendog = response + return self._rwendog + + _ncp = None + @property + def rnorm_cov_params(self): + """Parameter covariance under restrictions""" + if self._ncp is None: + P = self.ncoeffs + self._ncp = self.inv_rwexog[:P, :P] + return self._ncp + + _wncp = None + @property + def wrnorm_cov_params(self): + """ + Heteroskedasticity-consistent parameter covariance + Used to calculate White standard errors. + """ + if self._wncp is None: + df = self.df_resid + pred = np.dot(self.wexog, self.coeffs) + eps = np.diag((self.wendog - pred) ** 2) + sigmaSq = np.sum(eps) + pinvX = np.dot(self.rnorm_cov_params, self.wexog.T) + self._wncp = np.dot(np.dot(pinvX, eps), pinvX.T) * df / sigmaSq + return self._wncp + + _coeffs = None + @property + def coeffs(self): + """Estimated parameters""" + if self._coeffs is None: + betaLambda = np.dot(self.inv_rwexog, self.rwendog) + self._coeffs = betaLambda[:self.ncoeffs] + return self._coeffs + + def fit(self): + rncp = self.wrnorm_cov_params + lfit = RegressionResults(self, self.coeffs, normalized_cov_params=rncp) + return lfit diff --git a/pandas/setup.py b/pandas/setup.py new file mode 100644 index 00000000..f9945f0f --- /dev/null +++ b/pandas/setup.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python + +import numpy + +def configuration(parent_package='',top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('pandas', parent_package, top_path) + config.add_subpackage('core') + config.add_subpackage('io') + config.add_subpackage('rpy') + config.add_subpackage('sandbox') + config.add_subpackage('stats') + config.add_subpackage('util') + config.add_data_dir('tests') + + config.add_extension('_tseries', + sources=['src/tseries.c'], + include_dirs=[numpy.get_include()]) + config.add_extension('_sparse', + sources=['src/sparse.c'], + include_dirs=[numpy.get_include()]) + return config + +if __name__ == '__main__': + print('This is the wrong setup.py file to run') + diff --git a/pandas/sparse/__init__.py b/pandas/sparse/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sparse/api.py b/pandas/sparse/api.py new file mode 100644 index 00000000..230ad159 --- /dev/null 
+++ b/pandas/sparse/api.py @@ -0,0 +1,7 @@ +# pylint: disable=W0611 + +from pandas.sparse.array import SparseArray +from pandas.sparse.list import SparseList +from pandas.sparse.series import SparseSeries, SparseTimeSeries +from pandas.sparse.frame import SparseDataFrame +from pandas.sparse.panel import SparsePanel diff --git a/pandas/sparse/array.py b/pandas/sparse/array.py new file mode 100644 index 00000000..f0f02d31 --- /dev/null +++ b/pandas/sparse/array.py @@ -0,0 +1,435 @@ +""" +SparseArray data structure +""" + +# pylint: disable=E1101,E1103,W0231 + +from numpy import nan, ndarray +import numpy as np + +import operator +import pandas.core.common as com + +from pandas.util import py3compat + +from pandas._sparse import BlockIndex, IntIndex +import pandas._sparse as splib +import pandas.lib as lib + + +def _sparse_op_wrap(op, name): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. + """ + def wrapper(self, other): + if isinstance(other, np.ndarray): + assert(len(self) == len(other)) + if not isinstance(other, SparseArray): + other = SparseArray(other, fill_value=self.fill_value) + return _sparse_array_op(self, other, op, name) + elif np.isscalar(other): + new_fill_value = op(np.float64(self.fill_value), + np.float64(other)) + + return SparseArray(op(self.sp_values, other), + sparse_index=self.sp_index, + fill_value=new_fill_value) + else: # pragma: no cover + raise TypeError('operation with %s not supported' % type(other)) + + wrapper.__name__ = name + return wrapper + +def _sparse_array_op(left, right, op, name): + if np.isnan(left.fill_value): + sparse_op = lambda a, b: _sparse_nanop(a, b, name) + else: + sparse_op = lambda a, b: _sparse_fillop(a, b, name) + + if left.sp_index.equals(right.sp_index): + result = op(left.sp_values, right.sp_values) + result_index = left.sp_index + else: + result, result_index = sparse_op(left, right) + + try: + fill_value = op(left.fill_value, right.fill_value) + except ZeroDivisionError: + fill_value = nan + + return SparseArray(result, sparse_index=result_index, + fill_value=fill_value) + +def _sparse_nanop(this, other, name): + sparse_op = getattr(splib, 'sparse_nan%s' % name) + result, result_index = sparse_op(this.sp_values, + this.sp_index, + other.sp_values, + other.sp_index) + + return result, result_index + +def _sparse_fillop(this, other, name): + sparse_op = getattr(splib, 'sparse_%s' % name) + result, result_index = sparse_op(this.sp_values, + this.sp_index, + this.fill_value, + other.sp_values, + other.sp_index, + other.fill_value) + + return result, result_index + + +class SparseArray(np.ndarray): + """Data structure for labeled, sparse floating point data + +Parameters +---------- +data : {array-like, Series, SparseSeries, dict} +kind : {'block', 'integer'} +fill_value : float + Defaults to NaN (code for missing) +sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + +Notes +----- +SparseSeries objects are immutable via the typical Python means. 
If you +must change values, convert to dense, make your changes, then convert back +to sparse + """ + __array_priority__ = 15 + + sp_index = None + fill_value = None + + def __new__(cls, data, sparse_index=None, kind='integer', fill_value=None, + copy=False): + + is_sparse_array = isinstance(data, SparseArray) + if fill_value is None: + if is_sparse_array: + fill_value = data.fill_value + else: + fill_value = nan + + if is_sparse_array: + sparse_index = data.sp_index + values = np.asarray(data) + else: + # array-like + if sparse_index is None: + values, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + else: + values = data + assert(len(values) == sparse_index.npoints) + + # Create array, do *not* copy data by default + if copy: + subarr = np.array(values, dtype=np.float64, copy=True) + else: + subarr = np.asarray(values, dtype=np.float64) + + # Change the class of the array to be the subclass type. + output = subarr.view(cls) + output.sp_index = sparse_index + output.fill_value = np.float64(fill_value) + return output + + @property + def _constructor(self): + return lambda x: SparseArray(x, fill_value=self.fill_value, + kind=self.kind) + + @property + def kind(self): + if isinstance(self.sp_index, BlockIndex): + return 'block' + elif isinstance(self.sp_index, IntIndex): + return 'integer' + + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. + """ + self.sp_index = getattr(obj, 'sp_index', None) + self.fill_value = getattr(obj, 'fill_value', None) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(ndarray.__reduce__(self)) + subclass_state = self.fill_value, self.sp_index + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + ndarray.__setstate__(self, nd_state) + + fill_value, sp_index = own_state[:2] + self.sp_index = sp_index + self.fill_value = fill_value + + def __len__(self): + return self.sp_index.length + + def __repr__(self): + return '%s\n%s' % (np.ndarray.__repr__(self), + repr(self.sp_index)) + + # Arithmetic operators + + __add__ = _sparse_op_wrap(operator.add, 'add') + __sub__ = _sparse_op_wrap(operator.sub, 'sub') + __mul__ = _sparse_op_wrap(operator.mul, 'mul') + __truediv__ = _sparse_op_wrap(operator.truediv, 'truediv') + __floordiv__ = _sparse_op_wrap(operator.floordiv, 'floordiv') + __pow__ = _sparse_op_wrap(operator.pow, 'pow') + + # reverse operators + __radd__ = _sparse_op_wrap(operator.add, 'add') + __rsub__ = _sparse_op_wrap(lambda x, y: y - x, 'rsub') + __rmul__ = _sparse_op_wrap(operator.mul, 'mul') + __rtruediv__ = _sparse_op_wrap(lambda x, y: y / x, 'rtruediv') + __rfloordiv__ = _sparse_op_wrap(lambda x, y: y // x, 'rfloordiv') + __rpow__ = _sparse_op_wrap(lambda x, y: y ** x, 'rpow') + + def disable(self, other): + raise NotImplementedError('inplace binary ops not supported') + # Inplace operators + __iadd__ = disable + __isub__ = disable + __imul__ = disable + __itruediv__ = disable + __ifloordiv__ = disable + __ipow__ = disable + + # Python 2 division operators + if not py3compat.PY3: + __div__ = _sparse_op_wrap(operator.div, 'div') + __rdiv__ = _sparse_op_wrap(lambda x, y: y / x, '__rdiv__') + __idiv__ = disable + + @property + def values(self): + """ + Dense values + """ + output = np.empty(len(self), dtype=np.float64) + int_index = self.sp_index.to_int_index() + 
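+        # densify: start from an array filled with fill_value, then scatter
+        # the stored sparse values back into their integer index positions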
output.fill(self.fill_value) + output.put(int_index.indices, self) + return output + + @property + def sp_values(self): + # caching not an option, leaks memory + return self.view(np.ndarray) + + def __getitem__(self, key): + """ + + """ + if com.is_integer(key): + return self._get_val_at(key) + else: + data_slice = self.values[key] + return self._constructor(data_slice) + + def __getslice__(self, i, j): + if i < 0: + i = 0 + if j < 0: + j = 0 + slobj = slice(i, j) + return self.__getitem__(slobj) + + def _get_val_at(self, loc): + n = len(self) + if loc < 0: + loc += n + + if loc >= len(self) or loc < 0: + raise Exception('Out of bounds access') + + sp_loc = self.sp_index.lookup(loc) + if sp_loc == -1: + return self.fill_value + else: + return lib.get_value_at(self, sp_loc) + + def take(self, indices, axis=0): + """ + Sparse-compatible version of ndarray.take + + Returns + ------- + taken : ndarray + """ + assert(axis == 0) + indices = np.asarray(indices, dtype=int) + + n = len(self) + if (indices < 0).any() or (indices >= n).any(): + raise Exception('out of bounds access') + + if self.sp_index.npoints > 0: + locs = np.array([self.sp_index.lookup(loc) for loc in indices]) + result = self.sp_values.take(locs) + result[locs == -1] = self.fill_value + else: + result = np.empty(len(indices)) + result.fill(self.fill_value) + + return result + + def __setitem__(self, key, value): + raise Exception('SparseArray objects are immutable') + + def __setslice__(self, i, j, value): + raise Exception('SparseArray objects are immutable') + + def to_dense(self): + """ + Convert SparseSeries to (dense) Series + """ + return self.values + + def astype(self, dtype=None): + """ + + """ + dtype = np.dtype(dtype) + if dtype is not None and dtype not in (np.float_, float): + raise Exception('Can only support floating point data for now') + return self.copy() + + def copy(self, deep=True): + """ + Make a copy of the SparseSeries. Only the actual sparse values need to + be copied + """ + if deep: + values = self.sp_values.copy() + else: + values = self.sp_values + return SparseArray(values, sparse_index=self.sp_index, + fill_value=self.fill_value) + + def count(self): + """ + Compute sum of non-NA/null observations in SparseSeries. If the + fill_value is not NaN, the "sparse" locations will be included in the + observation count + + Returns + ------- + nobs : int + """ + sp_values = self.sp_values + valid_spvals = np.isfinite(sp_values).sum() + if self._null_fill_value: + return valid_spvals + else: + return valid_spvals + self.sp_index.ngaps + + @property + def _null_fill_value(self): + return np.isnan(self.fill_value) + + @property + def _valid_sp_values(self): + sp_vals = self.sp_values + mask = np.isfinite(sp_vals) + return sp_vals[mask] + + def sum(self, axis=None, dtype=None, out=None): + """ + Sum of non-NA/null values + + Returns + ------- + sum : float + """ + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + if self._null_fill_value: + return sp_sum + else: + nsparse = self.sp_index.ngaps + return sp_sum + self.fill_value * nsparse + + def cumsum(self, axis=0, dtype=None, out=None): + """ + Cumulative sum of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Returns + ------- + cumsum : Series + """ + if com.notnull(self.fill_value): + return self.to_dense().cumsum() + # TODO: what if sp_values contains NaN?? 
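+        # with a NaN fill_value the gaps contribute nothing to the running
+        # total, so it is enough to cumsum the stored sp_values and keep the
+        # existing sparse index (and therefore the NaN locations) unchanged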
+ return SparseArray(self.sp_values.cumsum(), + sparse_index=self.sp_index, + fill_value=self.fill_value) + + def mean(self, axis=None, dtype=None, out=None): + """ + Mean of non-NA/null values + + Returns + ------- + mean : float + """ + valid_vals = self._valid_sp_values + sp_sum = valid_vals.sum() + ct = len(valid_vals) + + if self._null_fill_value: + return sp_sum / ct + else: + nsparse = self.sp_index.ngaps + return (sp_sum + self.fill_value * nsparse) / (ct + nsparse) + +def make_sparse(arr, kind='block', fill_value=nan): + """ + Convert ndarray to sparse format + + Parameters + ---------- + arr : ndarray + kind : {'block', 'integer'} + fill_value : NaN or another value + + Returns + ------- + (sparse_values, index) : (ndarray, SparseIndex) + """ + arr = np.asarray(arr) + length = len(arr) + + if np.isnan(fill_value): + mask = -np.isnan(arr) + else: + mask = arr != fill_value + + indices = np.arange(length, dtype=np.int32)[mask] + + if kind == 'block': + locs, lens = splib.get_blocks(indices) + index = BlockIndex(length, locs, lens) + elif kind == 'integer': + index = IntIndex(length, indices) + else: # pragma: no cover + raise ValueError('must be block or integer type') + + sparsified_values = arr[mask] + return sparsified_values, index diff --git a/pandas/sparse/frame.py b/pandas/sparse/frame.py new file mode 100644 index 00000000..8304b6fd --- /dev/null +++ b/pandas/sparse/frame.py @@ -0,0 +1,857 @@ +""" +Data structures for sparse float data. Life is made simpler by dealing only with +float64 data +""" + +# pylint: disable=E1101,E1103,W0231,E0202 + +from numpy import nan +import numpy as np + +from pandas.core.common import _pickle_array, _unpickle_array, _try_sort +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.series import Series +from pandas.core.frame import (DataFrame, extract_index, _prep_ndarray, + _default_index) +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.core.datetools as datetools + +from pandas.sparse.series import SparseSeries +from pandas.util.decorators import Appender + + +class _SparseMockBlockManager(object): + + def __init__(self, sp_frame): + self.sp_frame = sp_frame + + def get(self, item): + return self.sp_frame[item].values + + def iget(self, i): + return self.get(self.sp_frame.columns[i]) + + @property + def shape(self): + x, y = self.sp_frame.shape + return y, x + + @property + def axes(self): + return [self.sp_frame.columns, self.sp_frame.index] + +class SparseDataFrame(DataFrame): + """ + DataFrame containing sparse floating point data in the form of SparseSeries + objects + + Parameters + ---------- + data : same types as can be passed to DataFrame + index : array-like, optional + column : array-like, optional + default_kind : {'block', 'integer'}, default 'block' + Default sparse kind for converting Series to SparseSeries. Will not + override SparseSeries passed into constructor + default_fill_value : float + Default fill_value for converting Series to SparseSeries. 
Will not + override SparseSeries passed in + """ + _verbose_info = False + _columns = None + _series = None + _is_mixed_type = False + ndim = 2 + + def __init__(self, data=None, index=None, columns=None, + default_kind='block', default_fill_value=None): + if default_fill_value is None: + default_fill_value = np.nan + + self.default_kind = default_kind + self.default_fill_value = default_fill_value + + if isinstance(data, dict): + sdict, columns, index = self._init_dict(data, index, columns) + elif isinstance(data, (np.ndarray, list)): + sdict, columns, index = self._init_matrix(data, index, columns) + elif isinstance(data, DataFrame): + sdict, columns, index = self._init_dict(data, data.index, + data.columns) + elif data is None: + sdict = {} + + if index is None: + index = Index([]) + + if columns is None: + columns = Index([]) + else: + for c in columns: + sdict[c] = Series(np.nan, index=index) + + self._series = sdict + self.columns = columns + self.index = index + + def _from_axes(self, data, axes): + columns, index = axes + return self._constructor(data, index=index, columns=columns) + + @cache_readonly + def _data(self): + return _SparseMockBlockManager(self) + + def _consolidate_inplace(self): + # do nothing when DataFrame calls this method + pass + + def convert_objects(self): + # XXX + return self + + @property + def _constructor(self): + def wrapper(data, index=None, columns=None): + return SparseDataFrame(data, index=index, columns=columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + return wrapper + + def _init_dict(self, data, index, columns, dtype=None): + # pre-filter out columns if we passed it + if columns is not None: + columns = _ensure_index(columns) + data = dict((k, v) for k, v in data.iteritems() if k in columns) + else: + columns = Index(_try_sort(data.keys())) + + if index is None: + index = extract_index(data) + + sp_maker = lambda x: SparseSeries(x, index=index, + kind=self.default_kind, + fill_value=self.default_fill_value, + copy=True) + + sdict = {} + for k, v in data.iteritems(): + if isinstance(v, Series): + # Force alignment, no copy necessary + if not v.index.equals(index): + v = v.reindex(index) + + if not isinstance(v, SparseSeries): + v = sp_maker(v) + else: + if isinstance(v, dict): + v = [v.get(i, nan) for i in index] + + v = sp_maker(v) + sdict[k] = v + + # TODO: figure out how to handle this case, all nan's? + # add in any other columns we want to have (completeness) + nan_vec = np.empty(len(index)) + nan_vec.fill(nan) + for c in columns: + if c not in sdict: + sdict[c] = sp_maker(nan_vec) + + return sdict, columns, index + + def _init_matrix(self, data, index, columns, dtype=None): + data = _prep_ndarray(data, copy=False) + N, K = data.shape + if index is None: + index = _default_index(N) + if columns is None: + columns = _default_index(K) + + if len(columns) != K: + raise Exception('Column length mismatch: %d vs. %d' % + (len(columns), K)) + if len(index) != N: + raise Exception('Index length mismatch: %d vs. 
%d' % + (len(index), N)) + + data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)]) + return self._init_dict(data, index, columns, dtype) + + def __array_wrap__(self, result): + return SparseDataFrame(result, index=self.index, columns=self.columns, + default_kind=self.default_kind, + default_fill_value=self.default_fill_value) + + def __getstate__(self): + series = dict((k, (v.sp_index, v.sp_values)) + for k, v in self.iteritems()) + columns = self.columns + index = self.index + + return (series, columns, index, self.default_fill_value, + self.default_kind) + + def __setstate__(self, state): + series, cols, idx, fv, kind = state + + if not isinstance(cols, Index): # pragma: no cover + columns = _unpickle_array(cols) + else: + columns = cols + + if not isinstance(idx, Index): # pragma: no cover + index = _unpickle_array(idx) + else: + index = idx + + series_dict = {} + for col, (sp_index, sp_values) in series.iteritems(): + series_dict[col] = SparseSeries(sp_values, sparse_index=sp_index, + fill_value=fv) + + self._series = series_dict + self.index = index + self.columns = columns + self.default_fill_value = fv + self.default_kind = kind + + def to_dense(self): + """ + Convert to dense DataFrame + + Returns + ------- + df : DataFrame + """ + data = dict((k, v.to_dense()) for k, v in self.iteritems()) + return DataFrame(data, index=self.index) + + def astype(self, dtype): + raise NotImplementedError + + def copy(self, deep=True): + """ + Make a copy of this SparseDataFrame + """ + series = dict((k, v.copy()) for k, v in self.iteritems()) + return SparseDataFrame(series, index=self.index, columns=self.columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + @property + def density(self): + """ + Ratio of non-sparse points to total (dense) data points + represented in the frame + """ + tot_nonsparse = sum([ser.sp_index.npoints + for _, ser in self.iteritems()]) + tot = len(self.index) * len(self.columns) + return tot_nonsparse / float(tot) + + #---------------------------------------------------------------------- + # Support different internal rep'n of SparseDataFrame + + def _set_item(self, key, value): + sp_maker = lambda x: SparseSeries(x, index=self.index, + fill_value=self.default_fill_value, + kind=self.default_kind) + if hasattr(value, '__iter__'): + if isinstance(value, Series): + clean_series = value.reindex(self.index) + if not isinstance(value, SparseSeries): + clean_series = sp_maker(clean_series) + else: + clean_series = sp_maker(value) + + self._series[key] = clean_series + # Scalar + else: + self._series[key] = sp_maker(value) + + if key not in self.columns: + self._insert_column(key) + + def _insert_column(self, key): + self.columns = Index(np.concatenate((self.columns, [key]))) + + def __delitem__(self, key): + """ + Delete column from DataFrame + """ + loc = self.columns.get_loc(key) + del self._series[key] + self._delete_column_index(loc) + + def _delete_column_index(self, loc): + if loc == len(self.columns) - 1: + new_columns = self.columns[:loc] + else: + new_columns = Index(np.concatenate((self.columns[:loc], + self.columns[loc+1:]))) + self.columns = new_columns + + _index = None + def _set_index(self, index): + self._index = _ensure_index(index) + for v in self._series.values(): + v.index = self._index + + def _get_index(self): + return self._index + + def _get_columns(self): + return self._columns + + def _set_columns(self, cols): + if len(cols) != len(self._series): + raise Exception('Columns length %d did not match data 
%d!' % + (len(cols), len(self._series))) + self._columns = _ensure_index(cols) + + index = property(fget=_get_index, fset=_set_index) + columns = property(fget=_get_columns, fset=_set_columns) + + def __getitem__(self, item): + """ + Retrieve column or slice from DataFrame + """ + try: + # unsure about how kludgy this is + s = self._series[item] + s.name = item + return s + except (TypeError, KeyError): + if isinstance(item, slice): + date_rng = self.index[item] + return self.reindex(date_rng) + + elif isinstance(item, np.ndarray): + if len(item) != len(self.index): + raise Exception('Item wrong length %d instead of %d!' % + (len(item), len(self.index))) + newIndex = self.index[item] + return self.reindex(newIndex) + else: # pragma: no cover + raise + + @Appender(DataFrame.get_value.__doc__, indents=0) + def get_value(self, index, col): + s = self._series[col] + return s.get_value(index) + + def set_value(self, index, col, value): + """ + Put single value at passed column and index + + Parameters + ---------- + index : row label + col : column label + value : scalar value + + Notes + ----- + This method *always* returns a new object. It is currently not + particularly efficient (and potentially very expensive) but is provided + for API compatibility with DataFrame + + Returns + ------- + frame : DataFrame + """ + dense = self.to_dense().set_value(index, col, value) + return dense.to_sparse(kind=self.default_kind, + fill_value=self.default_fill_value) + + def _slice(self, slobj, axis=0): + if axis == 0: + new_index = self.index[slobj] + new_columns = self.columns + else: + new_index = self.index + new_columns = self.columns[slobj] + + return self.reindex(index=new_index, columns=new_columns) + + def as_matrix(self, columns=None): + """ + Convert the frame to its Numpy-array matrix representation + + Columns are presented in sorted order unless a specific list + of columns is provided. + """ + if columns is None: + columns = self.columns + + if len(columns) == 0: + return np.zeros((len(self.index), 0), dtype=float) + + return np.array([self[col].values for col in columns]).T + + values = property(as_matrix) + + def xs(self, key, axis=0, copy=False): + """ + Returns a row (cross-section) from the SparseDataFrame as a Series + object. 
+ + Parameters + ---------- + key : some index contained in the index + + Returns + ------- + xs : Series + """ + if axis == 1: + data = self[key] + return data + + i = self.index.get_loc(key) + series = self._series + values = [series[k][i] for k in self.columns] + return Series(values, index=self.columns) + + #---------------------------------------------------------------------- + # Arithmetic-related methods + + def _combine_frame(self, other, func, fill_value=None, level=None): + this, other = self.align(other, join='outer', level=level, + copy=False) + new_index, new_columns = this.index, this.columns + + if level is not None: + raise NotImplementedError + + if self.empty and other.empty: + return SparseDataFrame(index=new_index) + + new_data = {} + if fill_value is not None: + # TODO: be a bit more intelligent here + for col in new_columns: + if col in this and col in other: + dleft = this[col].to_dense() + dright = other[col].to_dense() + result = dleft._binop(dright, func, fill_value=fill_value) + result = result.to_sparse(fill_value=this[col].fill_value) + new_data[col] = result + else: + for col in new_columns: + if col in this and col in other: + new_data[col] = func(this[col], other[col]) + + return self._constructor(data=new_data, index=new_index, + columns=new_columns) + + def _combine_match_index(self, other, func, fill_value=None): + new_data = {} + + if fill_value is not None: + raise NotImplementedError + + new_index = self.index.union(other.index) + this = self + if self.index is not new_index: + this = self.reindex(new_index) + + if other.index is not new_index: + other = other.reindex(new_index) + + for col, series in this.iteritems(): + new_data[col] = func(series.values, other.values) + + return self._constructor(new_data, index=new_index, + columns=self.columns) + + def _combine_match_columns(self, other, func, fill_value): + # patched version of DataFrame._combine_match_columns to account for + # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series, + # where 3.0 is numpy.float64 and series is a SparseSeries. 
Still + # possible for this to happen, which is bothersome + + if fill_value is not None: + raise NotImplementedError + + new_data = {} + + union = intersection = self.columns + + if not union.equals(other.index): + union = other.index.union(self.columns) + intersection = other.index.intersection(self.columns) + + for col in intersection: + new_data[col] = func(self[col], float(other[col])) + + return self._constructor(new_data, index=self.index, + columns=union) + + def _combine_const(self, other, func): + new_data = {} + for col, series in self.iteritems(): + new_data[col] = func(series, other) + + return self._constructor(data=new_data, index=self.index, + columns=self.columns) + + def _reindex_index(self, index, method, copy, level, fill_value=np.nan, + limit=None): + if level is not None: + raise Exception('Reindex by level not supported for sparse') + + if self.index.equals(index): + if copy: + return self.copy() + else: + return self + + if len(self.index) == 0: + return SparseDataFrame(index=index, columns=self.columns) + + indexer = self.index.get_indexer(index, method, limit=limit) + indexer = com._ensure_platform_int(indexer) + mask = indexer == -1 + need_mask = mask.any() + + new_series = {} + for col, series in self.iteritems(): + values = series.values + new = values.take(indexer) + + if need_mask: + np.putmask(new, mask, fill_value) + + new_series[col] = new + + return SparseDataFrame(new_series, index=index, columns=self.columns, + default_fill_value=self.default_fill_value) + + def _reindex_columns(self, columns, copy, level, fill_value, limit=None): + if level is not None: + raise Exception('Reindex by level not supported for sparse') + + if com.notnull(fill_value): + raise NotImplementedError + + if limit: + raise NotImplementedError + + # TODO: fill value handling + sdict = dict((k, v) for k, v in self.iteritems() if k in columns) + return SparseDataFrame(sdict, index=self.index, columns=columns, + default_fill_value=self.default_fill_value) + + def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer, + copy, fill_value): + if columns is None: + columns = self.columns + + new_arrays = {} + for col in columns: + if col not in self: + continue + if row_indexer is not None: + new_arrays[col] = com.take_1d(self[col].values, row_indexer, + fill_value=fill_value) + else: + new_arrays[col] = self[col] + + return self._constructor(new_arrays, index=index, columns=columns) + + def _rename_index_inplace(self, mapper): + self.index = [mapper(x) for x in self.index] + + def _rename_columns_inplace(self, mapper): + new_series = {} + new_columns = [] + + for col in self.columns: + new_col = mapper(col) + if new_col in new_series: # pragma: no cover + raise Exception('Non-unique mapping!') + new_series[new_col] = self[col] + new_columns.append(new_col) + + self.columns = new_columns + self._series = new_series + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take, return SparseDataFrame corresponding to + requested indices along an axis + + Parameters + ---------- + indices : list / array of ints + axis : {0, 1} + + Returns + ------- + taken : SparseDataFrame + """ + indices = com._ensure_platform_int(indices) + new_values = self.values.take(indices, axis=axis) + if axis == 0: + new_columns = self.columns + new_index = self.index.take(indices) + else: + new_columns = self.columns.take(indices) + new_index = self.index + return self._constructor(new_values, index=new_index, + columns=new_columns) + + def add_prefix(self, prefix): + f = (('%s' % 
prefix) + '%s').__mod__ + return self.rename(columns=f) + + def add_suffix(self, suffix): + f = ('%s' + ('%s' % suffix)).__mod__ + return self.rename(columns=f) + + def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', + sort=False): + if on is not None: + raise NotImplementedError + else: + return self._join_index(other, how, lsuffix, rsuffix) + + def _join_index(self, other, how, lsuffix, rsuffix): + if isinstance(other, Series): + assert(other.name is not None) + other = SparseDataFrame({other.name : other}, + default_fill_value=self.default_fill_value) + + join_index = self.index.join(other.index, how=how) + + this = self.reindex(join_index) + other = other.reindex(join_index) + + this, other = this._maybe_rename_join(other, lsuffix, rsuffix) + + result_series = this._series + other_series = other._series + result_series.update(other_series) + + return self._constructor(result_series, index=join_index) + + def _maybe_rename_join(self, other, lsuffix, rsuffix): + intersection = self.columns.intersection(other.columns) + + if len(intersection) > 0: + if not lsuffix and not rsuffix: + raise Exception('columns overlap: %s' % intersection) + + def lrenamer(x): + if x in intersection: + return '%s%s' % (x, lsuffix) + return x + + def rrenamer(x): + if x in intersection: + return '%s%s' % (x, rsuffix) + return x + + this = self.rename(columns=lrenamer) + other = other.rename(columns=rrenamer) + else: + this = self + + return this, other + + def transpose(self): + """ + Returns a DataFrame with the rows/columns switched. + """ + return SparseDataFrame(self.values.T, index=self.columns, + columns=self.index, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + T = property(transpose) + + @Appender(DataFrame.count.__doc__) + def count(self, axis=0, **kwds): + return self.apply(lambda x: x.count(), axis=axis) + + def cumsum(self, axis=0): + """ + Return SparseDataFrame of cumulative sums over requested axis. 
+ + Parameters + ---------- + axis : {0, 1} + 0 for row-wise, 1 for column-wise + + Returns + ------- + y : SparseDataFrame + """ + return self.apply(lambda x: x.cumsum(), axis=axis) + + def shift(self, periods, freq=None, **kwds): + """ + Analogous to DataFrame.shift + """ + from pandas.core.series import _resolve_offset + + offset = _resolve_offset(freq, kwds) + + new_series = {} + if offset is None: + new_index = self.index + for col, s in self.iteritems(): + new_series[col] = s.shift(periods) + else: + new_index = self.index.shift(periods, offset) + for col, s in self.iteritems(): + new_series[col] = SparseSeries(s.sp_values, index=new_index, + sparse_index=s.sp_index, + fill_value=s.fill_value) + + return SparseDataFrame(new_series, index=new_index, + columns=self.columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def apply(self, func, axis=0, broadcast=False): + """ + Analogous to DataFrame.apply, for SparseDataFrame + + Parameters + ---------- + func : function + Function to apply to each column + axis : {0, 1} + broadcast : bool, default False + For aggregation functions, return object of same size with values + propagated + + Returns + ------- + applied : Series or SparseDataFrame + """ + if not len(self.columns): + return self + + if isinstance(func, np.ufunc): + new_series = {} + for k, v in self.iteritems(): + applied = func(v) + applied.fill_value = func(applied.fill_value) + new_series[k] = applied + return SparseDataFrame(new_series, index=self.index, + columns=self.columns, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + else: + if not broadcast: + return self._apply_standard(func, axis) + else: + return self._apply_broadcast(func, axis) + + def applymap(self, func): + """ + Apply a function to a DataFrame that is intended to operate + elementwise, i.e. 
like doing map(func, series) for each series in the + DataFrame + + Parameters + ---------- + func : function + Python function, returns a single value from a single value + + Returns + ------- + applied : DataFrame + """ + return self.apply(lambda x: map(func, x)) + + @Appender(DataFrame.fillna.__doc__) + def fillna(self, value=None, method='pad', inplace=False, limit=None): + new_series = {} + for k, v in self.iterkv(): + new_series[k] = v.fillna(value=value, method=method, limit=limit) + + if inplace: + self._series = new_series + return self + else: + return self._constructor(new_series, index=self.index, + columns=self.columns) + +def stack_sparse_frame(frame): + """ + Only makes sense when fill_value is NaN + """ + lengths = [s.sp_index.npoints for _, s in frame.iteritems()] + nobs = sum(lengths) + + # this is pretty fast + minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + + inds_to_concat = [] + vals_to_concat = [] + for _, series in frame.iteritems(): + if not np.isnan(series.fill_value): + raise Exception('This routine assumes NaN fill value') + + int_index = series.sp_index.to_int_index() + inds_to_concat.append(int_index.indices) + vals_to_concat.append(series.sp_values) + + major_labels = np.concatenate(inds_to_concat) + stacked_values = np.concatenate(vals_to_concat) + index = MultiIndex(levels=[frame.index, frame.columns], + labels=[major_labels, minor_labels]) + + lp = DataFrame(stacked_values.reshape((nobs, 1)), index=index, + columns=['foo']) + return lp.sortlevel(level=0) + + +def homogenize(series_dict): + """ + Conform a set of SparseSeries (with NaN fill_value) to a common SparseIndex + corresponding to the locations where they all have data + + Parameters + ---------- + series_dict : dict or DataFrame + + Notes + ----- + Using the dumbest algorithm I could think of. Should put some more thought + into this + + Returns + ------- + homogenized : dict of SparseSeries + """ + index = None + + need_reindex = False + + for _, series in series_dict.iteritems(): + if not np.isnan(series.fill_value): + raise Exception('this method is only valid with NaN fill values') + + if index is None: + index = series.sp_index + elif not series.sp_index.equals(index): + need_reindex = True + index = index.intersect(series.sp_index) + + if need_reindex: + output = {} + for name, series in series_dict.iteritems(): + if not series.sp_index.equals(index): + series = series.sparse_reindex(index) + + output[name] = series + else: + output = series_dict + + return output diff --git a/pandas/sparse/list.py b/pandas/sparse/list.py new file mode 100644 index 00000000..62c9d096 --- /dev/null +++ b/pandas/sparse/list.py @@ -0,0 +1,137 @@ +import numpy as np + +from pandas.sparse.array import SparseArray +import pandas._sparse as splib + +class SparseList(object): + """ + Data structure for accumulating data to be converted into a + SparseArray. 
Has similar API to the standard Python list + + Parameters + ---------- + data : scalar or array-like + fill_value : scalar, default NaN + """ + def __init__(self, data=None, fill_value=np.nan): + self.fill_value = fill_value + self._chunks = [] + + if data is not None: + self.append(data) + + def __repr__(self): + contents = '\n'.join(repr(c) for c in self._chunks) + return '%s\n%s' % (object.__repr__(self), contents) + + def __len__(self): + return sum(len(c) for c in self._chunks) + + def __getitem__(self, i): + if i < 0: + if i + len(self) < 0: # pragma: no cover + raise ValueError('%d out of range' % i) + i += len(self) + + passed = 0 + j = 0 + while i >= passed + len(self._chunks[j]): + passed += len(self._chunks[j]) + j += 1 + return self._chunks[j][i - passed] + + def __setitem__(self, i, value): + raise NotImplementedError + + @property + def nchunks(self): + return len(self._chunks) + + @property + def is_consolidated(self): + return self.nchunks == 1 + + def consolidate(self, inplace=True): + """ + Internally consolidate chunks of data + + Parameters + ---------- + inplace : boolean, default True + Modify the calling object instead of constructing a new one + + Returns + ------- + splist : SparseList + If inplace=False, new object, otherwise reference to existing + object + """ + if not inplace: + result = self.copy() + else: + result = self + + if result.is_consolidated: + return result + + result._consolidate_inplace() + return result + + def _consolidate_inplace(self): + new_values = np.concatenate([c.sp_values for c in self._chunks]) + new_index = _concat_sparse_indexes([c.sp_index for c in self._chunks]) + new_arr = SparseArray(new_values, sparse_index=new_index, + fill_value=self.fill_value) + self._chunks = [new_arr] + + def copy(self): + """ + Return copy of the list + + Returns + ------- + new_list : SparseList + """ + new_splist = SparseList(fill_value=self.fill_value) + new_splist._chunks = list(self._chunks) + return new_splist + + def to_array(self): + """ + Return SparseArray from data stored in the SparseList + + Returns + ------- + sparr : SparseArray + """ + self.consolidate(inplace=True) + return self._chunks[0] + + def append(self, value): + """ + Append element or array-like chunk of data to the SparseList + + Parameters + ---------- + value: scalar or array-like + """ + if np.isscalar(value): + value = [value] + + sparr = SparseArray(value, fill_value=self.fill_value) + self._chunks.append(sparr) + self._consolidated = False + + +def _concat_sparse_indexes(indexes): + all_indices = [] + total_length = 0 + + for index in indexes: + # increment by offset + inds = index.to_int_index().indices + total_length + + all_indices.append(inds) + total_length += index.length + + return splib.IntIndex(total_length, np.concatenate(all_indices)) diff --git a/pandas/sparse/panel.py b/pandas/sparse/panel.py new file mode 100644 index 00000000..b843b653 --- /dev/null +++ b/pandas/sparse/panel.py @@ -0,0 +1,496 @@ +""" +Data structures for sparse float data. 
Life is made simpler by dealing only with +float64 data +""" + +# pylint: disable=E1101,E1103,W0231 + +import numpy as np + +from pandas.core.index import Index, MultiIndex, _ensure_index +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.sparse.frame import SparseDataFrame +from pandas.util.decorators import deprecate + +import pandas.core.common as com + +class SparsePanelAxis(object): + + def __init__(self, cache_field, frame_attr): + self.cache_field = cache_field + self.frame_attr = frame_attr + + def __get__(self, obj, type=None): + return getattr(obj, self.cache_field, None) + + def __set__(self, obj, value): + value = _ensure_index(value) + + if isinstance(value, MultiIndex): + raise NotImplementedError + + for v in obj._frames.itervalues(): + setattr(v, self.frame_attr, value) + + setattr(obj, self.cache_field, value) + + +class SparsePanel(Panel): + """ + Sparse version of Panel + + Parameters + ---------- + frames : dict of DataFrame objects + items : array-like + major_axis : array-like + minor_axis : array-like + default_kind : {'block', 'integer'}, default 'block' + Default sparse kind for converting Series to SparseSeries. Will not + override SparseSeries passed into constructor + default_fill_value : float + Default fill_value for converting Series to SparseSeries. Will not + override SparseSeries passed in + + Notes + ----- + """ + ndim = 3 + + def __init__(self, frames, items=None, major_axis=None, minor_axis=None, + default_fill_value=np.nan, default_kind='block'): + if isinstance(frames, np.ndarray): + new_frames = {} + for item, vals in zip(items, frames): + new_frames[item] = \ + SparseDataFrame(vals, index=major_axis, + columns=minor_axis, + default_fill_value=default_fill_value, + default_kind=default_kind) + frames = new_frames + + assert(isinstance(frames, dict)) + + self.default_fill_value = fill_value = default_fill_value + self.default_kind = kind = default_kind + + # pre-filter, if necessary + if items is None: + items = Index(sorted(frames.keys())) + items = _ensure_index(items) + + (clean_frames, + major_axis, + minor_axis) = _convert_frames(frames, major_axis, + minor_axis, kind=kind, + fill_value=fill_value) + + self._frames = clean_frames + + # do we want to fill missing ones? 
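+        # currently we do not: every requested item must already be present
+        # in the passed frames, otherwise the loop below raises rather than
+        # synthesizing an all-NaN SparseDataFrame for the missing item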
+ for item in items: + if item not in clean_frames: + raise Exception('column %s not found in data' % item) + + self._items = items + self.major_axis = major_axis + self.minor_axis = minor_axis + + def _consolidate_inplace(self): # pragma: no cover + # do nothing when DataFrame calls this method + pass + + def __array_wrap__(self, result): + return SparsePanel(result, items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, + default_kind=self.default_kind, + default_fill_value=self.default_fill_value) + + @classmethod + def from_dict(cls, data): + """ + Analogous to Panel.from_dict + """ + return SparsePanel(data) + + def to_dense(self): + """ + Convert SparsePanel to (dense) Panel + + Returns + ------- + dense : Panel + """ + return Panel(self.values, self.items, self.major_axis, + self.minor_axis) + + @property + def values(self): + # return dense values + return np.array([self._frames[item].values + for item in self.items]) + + # need a special property for items to make the field assignable + + _items = None + def _get_items(self): + return self._items + + def _set_items(self, new_items): + new_items = _ensure_index(new_items) + if isinstance(new_items, MultiIndex): + raise NotImplementedError + + # need to create new frames dict + + old_frame_dict = self._frames + old_items = self._items + self._frames = dict((new_k, old_frame_dict[old_k]) + for new_k, old_k in zip(new_items, old_items)) + self._items = new_items + items = property(fget=_get_items, fset=_set_items) + + # DataFrame's index + major_axis = SparsePanelAxis('_major_axis', 'index') + + # DataFrame's columns / "items" + minor_axis = SparsePanelAxis('_minor_axis', 'columns') + + def _get_item_cache(self, key): + return self._frames[key] + + def __setitem__(self, key, value): + if isinstance(value, DataFrame): + value = value.reindex(index=self.major_axis, + columns=self.minor_axis) + if not isinstance(value, SparseDataFrame): + value = value.to_sparse(fill_value=self.default_fill_value, + kind=self.default_kind) + else: + raise ValueError('only DataFrame objects can be set currently') + + self._frames[key] = value + + if key not in self.items: + self._items = Index(list(self.items) + [key]) + + def set_value(self, item, major, minor, value): + """ + Quickly set single value at (item, major, minor) location + + Parameters + ---------- + item : item label (panel item) + major : major axis label (panel item row) + minor : minor axis label (panel item column) + value : scalar + + Notes + ----- + This method *always* returns a new object. 
It is not particularly + efficient but is provided for API compatibility with Panel + + Returns + ------- + panel : SparsePanel + """ + dense = self.to_dense().set_value(item, major, minor, value) + return dense.to_sparse(kind=self.default_kind, + fill_value=self.default_fill_value) + + def __delitem__(self, key): + loc = self.items.get_loc(key) + indices = range(loc) + range(loc + 1, len(self.items)) + del self._frames[key] + self._items = self._items.take(indices) + + def __getstate__(self): + # pickling + return (self._frames, com._pickle_array(self.items), + com._pickle_array(self.major_axis), + com._pickle_array(self.minor_axis), + self.default_fill_value, self.default_kind) + + def __setstate__(self, state): + frames, items, major, minor, fv, kind = state + + self.default_fill_value = fv + self.default_kind = kind + self._items = _ensure_index(com._unpickle_array(items)) + self._major_axis = _ensure_index(com._unpickle_array(major)) + self._minor_axis = _ensure_index(com._unpickle_array(minor)) + self._frames = frames + + def copy(self): + """ + Make a (shallow) copy of the sparse panel + + Returns + ------- + copy : SparsePanel + """ + return SparsePanel(self._frames.copy(), items=self.items, + major_axis=self.major_axis, + minor_axis=self.minor_axis, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def to_frame(self, filter_observations=True): + """ + Convert SparsePanel to (dense) DataFrame + + Returns + ------- + frame : DataFrame + """ + if not filter_observations: + raise Exception('filter_observations=False not supported for ' + 'SparsePanel.to_long') + + I, N, K = self.shape + counts = np.zeros(N * K, dtype=int) + + d_values = {} + d_indexer = {} + + for item in self.items: + frame = self[item] + + values, major, minor = _stack_sparse_info(frame) + + # values are stacked column-major + indexer = minor * N + major + counts.put(indexer, counts.take(indexer) + 1) # cuteness + + d_values[item] = values + d_indexer[item] = indexer + + # have full set of observations for each item + mask = counts == I + + # for each item, take mask values at index locations for those sparse + # values, and use that to select values + values = np.column_stack([d_values[item][mask.take(d_indexer[item])] + for item in self.items]) + + inds, = mask.nonzero() + + # still column major + major_labels = inds % N + minor_labels = inds // N + + index = MultiIndex(levels=[self.major_axis, self.minor_axis], + labels=[major_labels, minor_labels]) + + df = DataFrame(values, index=index, columns=self.items) + return df.sortlevel(level=0) + + to_long = deprecate('to_long', to_frame) + toLong = deprecate('toLong', to_frame) + + def reindex(self, major=None, items=None, minor=None, major_axis=None, + minor_axis=None, copy=False): + """ + Conform / reshape panel axis labels to new input labels + + Parameters + ---------- + major : array-like, default None + items : array-like, default None + minor : array-like, default None + copy : boolean, default False + Copy underlying SparseDataFrame objects + + Returns + ------- + reindexed : SparsePanel + """ + major = com._mut_exclusive(major, major_axis) + minor = com._mut_exclusive(minor, minor_axis) + + if com._all_none(items, major, minor): + raise ValueError('Must specify at least one axis') + + major = self.major_axis if major is None else major + minor = self.minor_axis if minor is None else minor + + if items is not None: + new_frames = {} + for item in items: + if item in self._frames: + new_frames[item] = self._frames[item] + else: 
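+                    # reindex can only subset or reorder the existing items;
+                    # creating a brand-new (empty) item here is not implemented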
+ raise Exception('Reindexing with new items not yet ' + 'supported') + else: + new_frames = self._frames + + if copy: + new_frames = dict((k, v.copy()) for k, v in new_frames.iteritems()) + + return SparsePanel(new_frames, items=items, + major_axis=major, + minor_axis=minor, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def _combine(self, other, func, axis=0): + if isinstance(other, DataFrame): + return self._combineFrame(other, func, axis=axis) + elif isinstance(other, Panel): + return self._combinePanel(other, func) + elif np.isscalar(other): + new_frames = dict((k, func(v, other)) + for k, v in self.iterkv()) + return self._new_like(new_frames) + + def _combineFrame(self, other, func, axis=0): + index, columns = self._get_plane_axes(axis) + axis = self._get_axis_number(axis) + + other = other.reindex(index=index, columns=columns) + + if axis == 0: + new_values = func(self.values, other.values) + elif axis == 1: + new_values = func(self.values.swapaxes(0, 1), other.values.T) + new_values = new_values.swapaxes(0, 1) + elif axis == 2: + new_values = func(self.values.swapaxes(0, 2), other.values) + new_values = new_values.swapaxes(0, 2) + + # TODO: make faster! + new_frames = {} + for item, item_slice in zip(self.items, new_values): + old_frame = self[item] + ofv = old_frame.default_fill_value + ok = old_frame.default_kind + new_frames[item] = SparseDataFrame(item_slice, + index=self.major_axis, + columns=self.minor_axis, + default_fill_value=ofv, + default_kind=ok) + + return self._new_like(new_frames) + + def _new_like(self, new_frames): + return SparsePanel(new_frames, self.items, self.major_axis, + self.minor_axis, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + + def _combinePanel(self, other, func): + items = self.items + other.items + major = self.major_axis + other.major_axis + minor = self.minor_axis + other.minor_axis + + # could check that everything's the same size, but forget it + + this = self.reindex(items=items, major=major, minor=minor) + other = other.reindex(items=items, major=major, minor=minor) + + new_frames = {} + for item in items: + new_frames[item] = func(this[item], other[item]) + + if not isinstance(other, SparsePanel): + new_default_fill = self.default_fill_value + else: + # maybe unnecessary + new_default_fill = func(self.default_fill_value, + other.default_fill_value) + + return SparsePanel(new_frames, items, major, minor, + default_fill_value=new_default_fill, + default_kind=self.default_kind) + + def major_xs(self, key): + """ + Return slice of panel along major axis + + Parameters + ---------- + key : object + Major axis label + + Returns + ------- + y : DataFrame + index -> minor axis, columns -> items + """ + slices = dict((k, v.xs(key)) for k, v in self.iterkv()) + return DataFrame(slices, index=self.minor_axis, columns=self.items) + + def minor_xs(self, key): + """ + Return slice of panel along minor axis + + Parameters + ---------- + key : object + Minor axis label + + Returns + ------- + y : SparseDataFrame + index -> major axis, columns -> items + """ + slices = dict((k, v[key]) for k, v in self.iterkv()) + return SparseDataFrame(slices, index=self.major_axis, + columns=self.items, + default_fill_value=self.default_fill_value, + default_kind=self.default_kind) + +SparseWidePanel = SparsePanel + +def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'): + from pandas.core.panel import _get_combined_index + output = {} + for item, df in frames.iteritems(): + if 
not isinstance(df, SparseDataFrame): + df = SparseDataFrame(df, default_kind=kind, + default_fill_value=fill_value) + + output[item] = df + + if index is None: + all_indexes = [df.index for df in output.values()] + index = _get_combined_index(all_indexes) + if columns is None: + all_columns = [df.columns for df in output.values()] + columns = _get_combined_index(all_columns) + + index = _ensure_index(index) + columns = _ensure_index(columns) + + for item, df in output.iteritems(): + if not (df.index.equals(index) and df.columns.equals(columns)): + output[item] = df.reindex(index=index, columns=columns) + + return output, index, columns + + +def _stack_sparse_info(frame): + lengths = [s.sp_index.npoints for _, s in frame.iteritems()] + + # this is pretty fast + minor_labels = np.repeat(np.arange(len(frame.columns)), lengths) + + inds_to_concat = [] + vals_to_concat = [] + for col in frame.columns: + series = frame[col] + + if not np.isnan(series.fill_value): + raise Exception('This routine assumes NaN fill value') + + int_index = series.sp_index.to_int_index() + inds_to_concat.append(int_index.indices) + vals_to_concat.append(series.sp_values) + + major_labels = np.concatenate(inds_to_concat) + sparse_values = np.concatenate(vals_to_concat) + + return sparse_values, major_labels, minor_labels diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py new file mode 100644 index 00000000..dfe78a81 --- /dev/null +++ b/pandas/sparse/series.py @@ -0,0 +1,544 @@ +""" +Data structures for sparse float data. Life is made simpler by dealing only with +float64 data +""" + +# pylint: disable=E1101,E1103,W0231 + +from numpy import nan, ndarray +import numpy as np + +import operator + +from pandas.core.common import isnull +from pandas.core.index import Index, _ensure_index +from pandas.core.series import Series, TimeSeries, _maybe_match_name +from pandas.core.frame import DataFrame +import pandas.core.common as common +import pandas.core.datetools as datetools + +from pandas.util import py3compat + +from pandas.sparse.array import (make_sparse, _sparse_array_op, SparseArray) +from pandas._sparse import BlockIndex, IntIndex +import pandas._sparse as splib + +from pandas.util.decorators import Appender + +#------------------------------------------------------------------------------- +# Wrapper function for Series arithmetic methods + +def _sparse_op_wrap(op, name): + """ + Wrapper function for Series arithmetic operations, to avoid + code duplication. 
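+
+    Illustrative only (``a`` and ``b`` are hypothetical SparseSeries):
+    ``a + b`` dispatches to the method produced by
+    ``_sparse_op_wrap(operator.add, 'add')``, which aligns the two series
+    and combines their sparse values, while ``a + 5.0`` operates directly
+    on ``a.sp_values`` and recomputes the fill value as
+    ``op(a.fill_value, 5.0)``.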
+ """ + def wrapper(self, other): + if isinstance(other, Series): + if not isinstance(other, SparseSeries): + other = other.to_sparse(fill_value=self.fill_value) + return _sparse_series_op(self, other, op, name) + elif isinstance(other, DataFrame): + return NotImplemented + elif np.isscalar(other): + new_fill_value = op(np.float64(self.fill_value), + np.float64(other)) + + return SparseSeries(op(self.sp_values, other), + index=self.index, + sparse_index=self.sp_index, + fill_value=new_fill_value, + name=self.name) + else: # pragma: no cover + raise TypeError('operation with %s not supported' % type(other)) + + wrapper.__name__ = name + return wrapper + +def _sparse_series_op(left, right, op, name): + left, right = left.align(right, join='outer', copy=False) + new_index = left.index + new_name = _maybe_match_name(left, right) + + result = _sparse_array_op(left, right, op, name) + result = result.view(SparseSeries) + result.index = new_index + result.name = new_name + + return result + +class SparseSeries(SparseArray, Series): + __array_priority__ = 15 + + sp_index = None + fill_value = None + + def __new__(cls, data, index=None, sparse_index=None, kind='block', + fill_value=None, name=None, copy=False): + + is_sparse_array = isinstance(data, SparseArray) + if fill_value is None: + if is_sparse_array: + fill_value = data.fill_value + else: + fill_value = nan + + if is_sparse_array: + if isinstance(data, SparseSeries) and index is None: + index = data.index + elif index is not None: + assert(len(index) == len(data)) + + sparse_index = data.sp_index + values = np.asarray(data) + elif isinstance(data, (Series, dict)): + if index is None: + index = data.index + + data = Series(data) + values, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + elif np.isscalar(data): # pragma: no cover + if index is None: + raise Exception('must pass index!') + + values = np.empty(len(index)) + values.fill(data) + + # TODO: more efficient + + values, sparse_index = make_sparse(values, kind=kind, + fill_value=fill_value) + + else: + # array-like + if sparse_index is None: + values, sparse_index = make_sparse(data, kind=kind, + fill_value=fill_value) + else: + values = data + assert(len(values) == sparse_index.npoints) + + if index is None: + index = Index(np.arange(sparse_index.length)) + index = _ensure_index(index) + + # Create array, do *not* copy data by default + if copy: + subarr = np.array(values, dtype=np.float64, copy=True) + else: + subarr = np.asarray(values, dtype=np.float64) + + if index.is_all_dates: + cls = SparseTimeSeries + + # Change the class of the array to be the subclass type. + output = subarr.view(cls) + output.sp_index = sparse_index + output.fill_value = np.float64(fill_value) + output.index = index + output.name = name + return output + + def __init__(self, data, index=None, sparse_index=None, kind='block', + fill_value=None, name=None, copy=False): + """Data structure for labeled, sparse floating point data + +Parameters +---------- +data : {array-like, Series, SparseSeries, dict} +kind : {'block', 'integer'} +fill_value : float + Defaults to NaN (code for missing) +sparse_index : {BlockIndex, IntIndex}, optional + Only if you have one. Mainly used internally + +Notes +----- +SparseSeries objects are immutable via the typical Python means. 
If you +must change values, convert to dense, make your changes, then convert back +to sparse + """ + pass + + @property + def _constructor(self): + def make_sp_series(data, index=None, name=None): + return SparseSeries(data, index=index, fill_value=self.fill_value, + kind=self.kind, name=name) + + return make_sp_series + + @property + def kind(self): + if isinstance(self.sp_index, BlockIndex): + return 'block' + elif isinstance(self.sp_index, IntIndex): + return 'integer' + + def __array_finalize__(self, obj): + """ + Gets called after any ufunc or other array operations, necessary + to pass on the index. + """ + self._index = getattr(obj, '_index', None) + self.name = getattr(obj, 'name', None) + self.sp_index = getattr(obj, 'sp_index', None) + self.fill_value = getattr(obj, 'fill_value', None) + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(ndarray.__reduce__(self)) + + subclass_state = (self.index, self.fill_value, self.sp_index, + self.name) + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + nd_state, own_state = state + ndarray.__setstate__(self, nd_state) + + + index, fill_value, sp_index = own_state[:3] + name = None + if len(own_state) > 3: + name = own_state[3] + + self.sp_index = sp_index + self.fill_value = fill_value + self.index = index + self.name = name + + def __len__(self): + return self.sp_index.length + + def __repr__(self): + series_rep = Series.__repr__(self) + rep = '%s\n%s' % (series_rep, repr(self.sp_index)) + return rep + + # Arithmetic operators + + __add__ = _sparse_op_wrap(operator.add, 'add') + __sub__ = _sparse_op_wrap(operator.sub, 'sub') + __mul__ = _sparse_op_wrap(operator.mul, 'mul') + __truediv__ = _sparse_op_wrap(operator.truediv, 'truediv') + __floordiv__ = _sparse_op_wrap(operator.floordiv, 'floordiv') + __pow__ = _sparse_op_wrap(operator.pow, 'pow') + + # reverse operators + __radd__ = _sparse_op_wrap(operator.add, '__radd__') + __rsub__ = _sparse_op_wrap(lambda x, y: y - x, '__rsub__') + __rmul__ = _sparse_op_wrap(operator.mul, '__rmul__') + __rtruediv__ = _sparse_op_wrap(lambda x, y: y / x, '__rtruediv__') + __rfloordiv__ = _sparse_op_wrap(lambda x, y: y // x, 'floordiv') + __rpow__ = _sparse_op_wrap(lambda x, y: y ** x, '__rpow__') + + # Python 2 division operators + if not py3compat.PY3: + __div__ = _sparse_op_wrap(operator.div, 'div') + __rdiv__ = _sparse_op_wrap(lambda x, y: y / x, '__rdiv__') + + def __getitem__(self, key): + """ + + """ + try: + return self._get_val_at(self.index.get_loc(key)) + + except KeyError: + if isinstance(key, (int, np.integer)): + return self._get_val_at(key) + raise Exception('Requested index not in this series!') + + except TypeError: + # Could not hash item, must be array-like? + pass + + # is there a case where this would NOT be an ndarray? + # need to find an example, I took out the case for now + + dataSlice = self.values[key] + new_index = Index(self.index.view(ndarray)[key]) + return self._constructor(dataSlice, index=new_index, name=self.name) + + def abs(self): + """ + Return an object with absolute value taken. 
Only applicable to objects + that are all numeric + + Returns + ------- + abs: type of caller + """ + res_sp_values = np.abs(self.sp_values) + return SparseSeries(res_sp_values, index=self.index, + sparse_index=self.sp_index, + fill_value=self.fill_value) + + def get(self, label, default=None): + """ + Returns value occupying requested label, default to specified + missing value if not present. Analogous to dict.get + + Parameters + ---------- + label : object + Label value looking for + default : object, optional + Value to return if label not in index + + Returns + ------- + y : scalar + """ + if label in self.index: + loc = self.index.get_loc(label) + return self._get_val_at(loc) + else: + return default + + def get_value(self, label): + """ + Retrieve single value at passed index label + + Parameters + ---------- + index : label + + Returns + ------- + value : scalar value + """ + loc = self.index.get_loc(label) + return self._get_val_at(loc) + + def set_value(self, label, value): + """ + Quickly set single value at passed label. If label is not contained, a + new object is created with the label placed at the end of the result + index + + Parameters + ---------- + label : object + Partial indexing with MultiIndex not allowed + value : object + Scalar value + + Notes + ----- + This method *always* returns a new object. It is not particularly + efficient but is provided for API compatibility with Series + + Returns + ------- + series : SparseSeries + """ + dense = self.to_dense().set_value(label, value) + return dense.to_sparse(kind=self.kind, fill_value=self.fill_value) + + def to_dense(self, sparse_only=False): + """ + Convert SparseSeries to (dense) Series + """ + if sparse_only: + int_index = self.sp_index.to_int_index() + index = self.index.take(int_index.indices) + return Series(self.sp_values, index=index, name=self.name) + else: + return Series(self.values, index=self.index, name=self.name) + + def astype(self, dtype=None): + """ + + """ + if dtype is not None and dtype not in (np.float_, float): + raise Exception('Can only support floating point data') + + return self.copy() + + def copy(self, deep=True): + """ + Make a copy of the SparseSeries. 
Only the actual sparse values need to + be copied + """ + if deep: + values = self.sp_values.copy() + else: + values = self.sp_values + return SparseSeries(values, index=self.index, + sparse_index=self.sp_index, + fill_value=self.fill_value, name=self.name) + + def reindex(self, index=None, method=None, copy=True, limit=None): + """ + Conform SparseSeries to new Index + + See Series.reindex docstring for general behavior + + Returns + ------- + reindexed : SparseSeries + """ + new_index = _ensure_index(index) + + if self.index.equals(new_index): + if copy: + return self.copy() + else: + return self + + if len(self.index) == 0: + # FIXME: inelegant / slow + values = np.empty(len(new_index), dtype=np.float64) + values.fill(nan) + return SparseSeries(values, index=new_index, + fill_value=self.fill_value) + + new_index, fill_vec = self.index.reindex(index, method=method, + limit=limit) + new_values = common.take_1d(self.values, fill_vec) + return SparseSeries(new_values, index=new_index, + fill_value=self.fill_value, name=self.name) + + def sparse_reindex(self, new_index): + """ + Conform sparse values to new SparseIndex + + Parameters + ---------- + new_index : {BlockIndex, IntIndex} + + Returns + ------- + reindexed : SparseSeries + """ + assert(isinstance(new_index, splib.SparseIndex)) + + new_values = self.sp_index.to_int_index().reindex(self.sp_values, + self.fill_value, + new_index) + return SparseSeries(new_values, index=self.index, + sparse_index=new_index, + fill_value=self.fill_value) + + @Appender(Series.fillna.__doc__) + def fillna(self, value=None, method='pad', inplace=False, limit=None): + dense = self.to_dense() + filled = dense.fillna(value=value, method=method, limit=limit) + result = filled.to_sparse(kind=self.kind, + fill_value=self.fill_value) + + if inplace: + self.sp_values[:] = result.values + return self + else: + return result + + def take(self, indices, axis=0): + """ + Sparse-compatible version of ndarray.take + + Returns + ------- + taken : ndarray + """ + new_values = SparseArray.take(self, indices) + new_index = self.index.take(indices) + return self._constructor(new_values, index=new_index) + + def cumsum(self, axis=0, dtype=None, out=None): + """ + Cumulative sum of values. Preserves locations of NaN values + + Extra parameters are to preserve ndarray interface. + + Returns + ------- + cumsum : Series or SparseSeries + """ + result = SparseArray.cumsum(self) + if isinstance(result, SparseArray): + result = self._attach_meta(result) + return result + + def _attach_meta(self, sparse_arr): + sparse_series = sparse_arr.view(SparseSeries) + sparse_series.index = self.index + sparse_series.name = self.name + return sparse_series + + def dropna(self): + """ + Analogous to Series.dropna. 
If fill_value=NaN, returns a dense Series + """ + # TODO: make more efficient + dense_valid = self.to_dense().valid() + if isnull(self.fill_value): + return dense_valid + else: + return dense_valid.to_sparse(fill_value=self.fill_value) + + def shift(self, periods, freq=None, **kwds): + """ + Analogous to Series.shift + """ + from pandas.core.series import _resolve_offset + + offset = _resolve_offset(freq, kwds) + + # no special handling of fill values yet + if not isnull(self.fill_value): + dense_shifted = self.to_dense().shift(periods, freq=freq, + **kwds) + return dense_shifted.to_sparse(fill_value=self.fill_value, + kind=self.kind) + + if periods == 0: + return self.copy() + + if offset is not None: + return SparseSeries(self.sp_values, + sparse_index=self.sp_index, + index=self.index.shift(periods, offset), + fill_value=self.fill_value) + + int_index = self.sp_index.to_int_index() + new_indices = int_index.indices + periods + start, end = new_indices.searchsorted([0, int_index.length]) + + new_indices = new_indices[start:end] + + new_sp_index = IntIndex(len(self), new_indices) + if isinstance(self.sp_index, BlockIndex): + new_sp_index = new_sp_index.to_block_index() + + return SparseSeries(self.sp_values[start:end].copy(), + index=self.index, + sparse_index=new_sp_index, + fill_value=self.fill_value) + + def combine_first(self, other): + """ + Combine Series values, choosing the calling Series's values + first. Result index will be the union of the two indexes + + Parameters + ---------- + other : Series + + Returns + ------- + y : Series + """ + if isinstance(other, SparseSeries): + other = other.to_dense() + + dense_combined = self.to_dense().combine_first(other) + return dense_combined.to_sparse(fill_value=self.fill_value) + +class SparseTimeSeries(SparseSeries, TimeSeries): + pass diff --git a/pandas/sparse/tests/__init__.py b/pandas/sparse/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/sparse/tests/test_array.py b/pandas/sparse/tests/test_array.py new file mode 100644 index 00000000..45973214 --- /dev/null +++ b/pandas/sparse/tests/test_array.py @@ -0,0 +1,154 @@ +from numpy import nan, ndarray +import numpy as np + +import operator +import pickle +import unittest + +from pandas.core.series import Series +from pandas.core.common import notnull +from pandas.sparse.api import SparseArray +from pandas.util.testing import assert_almost_equal + +def assert_sp_array_equal(left, right): + assert_almost_equal(left.sp_values, right.sp_values) + assert(left.sp_index.equals(right.sp_index)) + if np.isnan(left.fill_value): + assert(np.isnan(right.fill_value)) + else: + assert(left.fill_value == right.fill_value) + + +class TestSparseArray(unittest.TestCase): + + def setUp(self): + self.arr_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) + self.arr = SparseArray(self.arr_data) + self.zarr = SparseArray([0, 0, 1, 2, 3, 0, 4, 5, 0, 6], fill_value=0) + + def test_constructor_from_sparse(self): + res = SparseArray(self.zarr) + self.assertEquals(res.fill_value, 0) + assert_almost_equal(res.sp_values, self.zarr.sp_values) + + def test_constructor_copy(self): + cp = SparseArray(self.arr, copy=True) + cp.sp_values[:3] = 0 + self.assert_(not (self.arr.sp_values[:3] == 0).any()) + + not_copy = SparseArray(self.arr) + not_copy.sp_values[:3] = 0 + self.assert_((self.arr.sp_values[:3] == 0).all()) + + def test_astype(self): + res = self.arr.astype('f8') + res.sp_values[:3] = 27 + self.assert_(not (self.arr.sp_values[:3] == 27).any()) + + 
self.assertRaises(Exception, self.arr.astype, 'i8') + + def test_copy_shallow(self): + arr2 = self.arr.copy(deep=False) + + def _get_base(values): + base = values.base + while base.base is not None: + base = base.base + return base + + assert(_get_base(arr2) is _get_base(self.arr)) + + def test_values_asarray(self): + assert_almost_equal(self.arr.values, self.arr_data) + assert_almost_equal(self.arr.to_dense(), self.arr_data) + assert_almost_equal(self.arr.sp_values, np.asarray(self.arr)) + + def test_getitem(self): + def _checkit(i): + assert_almost_equal(self.arr[i], self.arr.values[i]) + + for i in range(len(self.arr)): + _checkit(i) + _checkit(-i) + + def test_getslice(self): + result = self.arr[:-3] + exp = SparseArray(self.arr.values[:-3]) + assert_sp_array_equal(result, exp) + + result = self.arr[-4:] + exp = SparseArray(self.arr.values[-4:]) + assert_sp_array_equal(result, exp) + + # two corner cases from Series + result = self.arr[-12:] + exp = SparseArray(self.arr) + assert_sp_array_equal(result, exp) + + result = self.arr[:-12] + exp = SparseArray(self.arr.values[:0]) + assert_sp_array_equal(result, exp) + + def test_binary_operators(self): + data1 = np.random.randn(20) + data2 = np.random.randn(20) + data1[::2] = np.nan + data2[::3] = np.nan + + arr1 = SparseArray(data1) + arr2 = SparseArray(data2) + + data1[::2] = 3 + data2[::3] = 3 + farr1 = SparseArray(data1, fill_value=3) + farr2 = SparseArray(data2, fill_value=3) + + def _check_op(op, first, second): + res = op(first, second) + exp = SparseArray(op(first.values, second.values), + fill_value=first.fill_value) + self.assert_(isinstance(res, SparseArray)) + assert_almost_equal(res.values, exp.values) + + res2 = op(first, second.values) + self.assert_(isinstance(res2, SparseArray)) + assert_sp_array_equal(res, res2) + + res3 = op(first.values, second) + self.assert_(isinstance(res3, SparseArray)) + assert_sp_array_equal(res, res3) + + res4 = op(first, 4) + self.assert_(isinstance(res4, SparseArray)) + exp = op(first.values, 4) + exp_fv = op(first.fill_value, 4) + assert_almost_equal(res4.fill_value, exp_fv) + assert_almost_equal(res4.values, exp) + + def _check_inplace_op(op): + tmp = arr1.copy() + self.assertRaises(NotImplementedError, op, tmp, arr2) + + bin_ops = [operator.add, operator.sub, operator.mul, operator.truediv, + operator.floordiv, operator.pow] + for op in bin_ops: + _check_op(op, arr1, arr2) + _check_op(op, farr1, farr2) + + inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', 'ipow'] + for op in inplace_ops: + _check_inplace_op(getattr(operator, op)) + + def test_pickle(self): + def _check_roundtrip(obj): + pickled = pickle.dumps(obj) + unpickled = pickle.loads(pickled) + assert_sp_array_equal(unpickled, obj) + + _check_roundtrip(self.arr) + _check_roundtrip(self.zarr) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/sparse/tests/test_libsparse.py b/pandas/sparse/tests/test_libsparse.py new file mode 100644 index 00000000..2ff41753 --- /dev/null +++ b/pandas/sparse/tests/test_libsparse.py @@ -0,0 +1,365 @@ +from unittest import TestCase + +from pandas import Series + +import nose +from numpy import nan +import numpy as np +import operator +from numpy.testing import assert_almost_equal, assert_equal + +from pandas.core.sparse import SparseSeries +from pandas import DataFrame + +from pandas._sparse import IntIndex, BlockIndex +import pandas._sparse as splib + +TEST_LENGTH = 20 + +plain_case = dict(xloc 
= [0, 7, 15], + xlen = [3, 5, 5], + yloc = [2, 9, 14], + ylen = [2, 3, 5], + intersect_loc = [2, 9, 15], + intersect_len = [1, 3, 4]) +delete_blocks = dict(xloc = [0, 5], + xlen = [4, 4], + yloc = [1], + ylen = [4], + intersect_loc = [1], + intersect_len = [3]) +split_blocks = dict(xloc = [0], + xlen = [10], + yloc = [0, 5], + ylen = [3, 7], + intersect_loc = [0, 5], + intersect_len = [3, 5]) +skip_block = dict(xloc = [10], + xlen = [5], + yloc = [0, 12], + ylen = [5, 3], + intersect_loc = [12], + intersect_len = [3]) + +no_intersect = dict(xloc = [0, 10], + xlen = [4, 6], + yloc = [5, 17], + ylen = [4, 2], + intersect_loc = [], + intersect_len = []) + +def check_cases(_check_case): + def _check_case_dict(case): + _check_case(case['xloc'], case['xlen'], case['yloc'], case['ylen'], + case['intersect_loc'], case['intersect_len']) + + _check_case_dict(plain_case) + _check_case_dict(delete_blocks) + _check_case_dict(split_blocks) + _check_case_dict(skip_block) + _check_case_dict(no_intersect) + + # one or both is empty + _check_case([0], [5], [], [], [], []) + _check_case([], [], [], [], [], []) + +def test_index_make_union(): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + bresult = xindex.make_union(yindex) + assert(isinstance(bresult, BlockIndex)) + assert_equal(bresult.blocs, eloc) + assert_equal(bresult.blengths, elen) + + ixindex = xindex.to_int_index() + iyindex = yindex.to_int_index() + iresult = ixindex.make_union(iyindex) + assert(isinstance(iresult, IntIndex)) + assert_equal(iresult.indices, bresult.to_int_index().indices) + + """ + x: ---- + y: ---- + r: -------- + """ + xloc = [0]; xlen = [5] + yloc = [5]; ylen = [4] + eloc = [0]; elen = [9] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ----- ----- + y: ----- -- + """ + xloc = [0, 10]; xlen = [5, 5] + yloc = [2, 17]; ylen = [5, 2] + eloc = [0, 10, 17]; elen = [7, 5, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ + y: ------- + r: ---------- + """ + xloc = [1]; xlen = [5] + yloc = [3]; ylen = [5] + eloc = [1]; elen = [7] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ ----- + y: ------- + r: ------------- + """ + xloc = [2, 10]; xlen = [4, 4] + yloc = [4]; ylen = [8] + eloc = [2]; elen = [12] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: --- ----- + y: ------- + r: ------------- + """ + xloc = [0, 5]; xlen = [3, 5] + yloc = [0]; ylen = [7] + eloc = [0]; elen = [10] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ------ ----- + y: ------- --- + r: ------------- + """ + xloc = [2, 10]; xlen = [4, 4] + yloc = [4, 13]; ylen = [8, 4] + eloc = [2]; elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ---------------------- + y: ---- ---- --- + r: ---------------------- + """ + xloc = [2]; xlen = [15] + yloc = [4, 9, 14]; ylen = [3, 2, 2] + eloc = [2]; elen = [15] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + """ + x: ---- --- + y: --- --- + """ + xloc = [0, 10]; xlen = [3, 3] + yloc = [5, 15]; ylen = [2, 2] + eloc = [0, 5, 10, 15]; elen = [3, 2, 3, 2] + _check_case(xloc, xlen, yloc, ylen, eloc, elen) + + # TODO: different-length index objects + +def test_lookup(): + + def _check(index): + assert(index.lookup(0) == -1) + assert(index.lookup(5) == 0) + assert(index.lookup(7) == 2) + assert(index.lookup(8) == -1) + assert(index.lookup(9) == -1) + assert(index.lookup(10) == -1) + assert(index.lookup(11) == -1) + 
assert(index.lookup(12) == 3) + assert(index.lookup(17) == 8) + assert(index.lookup(18) == -1) + + bindex = BlockIndex(20, [5, 12], [3, 6]) + iindex = bindex.to_int_index() + + _check(bindex) + _check(iindex) + + # corner cases + +def test_intersect(): + def _check_correct(a, b, expected): + result = a.intersect(b) + assert(result.equals(expected)) + + def _check_length_exc(a, longer): + nose.tools.assert_raises(Exception, a.intersect, longer) + + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + expected = BlockIndex(TEST_LENGTH, eloc, elen) + longer_index = BlockIndex(TEST_LENGTH + 1, yloc, ylen) + + _check_correct(xindex, yindex, expected) + _check_correct(xindex.to_int_index(), + yindex.to_int_index(), + expected.to_int_index()) + + _check_length_exc(xindex, longer_index) + _check_length_exc(xindex.to_int_index(), + longer_index.to_int_index()) + + check_cases(_check_case) + +class TestBlockIndex(TestCase): + + def test_equals(self): + index = BlockIndex(10, [0, 4], [2, 5]) + + self.assert_(index.equals(index)) + self.assert_(not index.equals(BlockIndex(10, [0, 4], [2, 6]))) + + def test_check_integrity(self): + locs = [] + lengths = [] + + # 0-length OK + index = BlockIndex(0, locs, lengths) + + # also OK even though empty + index = BlockIndex(1, locs, lengths) + + # block extend beyond end + self.assertRaises(Exception, BlockIndex, 10, [5], [10]) + + # block overlap + self.assertRaises(Exception, BlockIndex, 10, [2, 5], [5, 3]) + + def test_to_int_index(self): + locs = [0, 10] + lengths = [4, 6] + exp_inds = [0, 1, 2, 3, 10, 11, 12, 13, 14, 15] + + block = BlockIndex(20, locs, lengths) + dense = block.to_int_index() + + assert_equal(dense.indices, exp_inds) + + def test_to_block_index(self): + index = BlockIndex(10, [0, 5], [4, 5]) + self.assert_(index.to_block_index() is index) + +class TestIntIndex(TestCase): + + def test_equals(self): + index = IntIndex(10, [0, 1, 2, 3, 4]) + self.assert_(index.equals(index)) + self.assert_(not index.equals(IntIndex(10, [0, 1, 2, 3]))) + + def test_to_block_index(self): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + # see if survive the round trip + xbindex = xindex.to_int_index().to_block_index() + ybindex = yindex.to_int_index().to_block_index() + self.assert_(isinstance(xbindex, BlockIndex)) + self.assert_(xbindex.equals(xindex)) + self.assert_(ybindex.equals(yindex)) + check_cases(_check_case) + + def test_to_int_index(self): + index = IntIndex(10, [2, 3, 4, 5, 6]) + self.assert_(index.to_int_index() is index) + +class TestSparseOperators(TestCase): + + def _nan_op_tests(self, sparse_op, python_op): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + xdindex = xindex.to_int_index() + ydindex = yindex.to_int_index() + + x = np.arange(xindex.npoints) * 10. + 1 + y = np.arange(yindex.npoints) * 100. + 1 + + result_block_vals, rb_index = sparse_op(x, xindex, y, yindex) + result_int_vals, ri_index = sparse_op(x, xdindex, y, ydindex) + + self.assert_(rb_index.to_int_index().equals(ri_index)) + assert_equal(result_block_vals, result_int_vals) + + # check versus Series... 
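+            # the sparse kernel and the plain dense Series arithmetic should
+            # agree once missing (NaN) entries are dropped via .valid()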
+ xseries = Series(x, xdindex.indices) + yseries = Series(y, ydindex.indices) + series_result = python_op(xseries, yseries).valid() + assert_equal(result_block_vals, series_result.values) + assert_equal(result_int_vals, series_result.values) + + check_cases(_check_case) + + def _op_tests(self, sparse_op, python_op): + def _check_case(xloc, xlen, yloc, ylen, eloc, elen): + xindex = BlockIndex(TEST_LENGTH, xloc, xlen) + yindex = BlockIndex(TEST_LENGTH, yloc, ylen) + + xdindex = xindex.to_int_index() + ydindex = yindex.to_int_index() + + x = np.arange(xindex.npoints) * 10. + 1 + y = np.arange(yindex.npoints) * 100. + 1 + + xfill = 0 + yfill = 2 + + result_block_vals, rb_index = sparse_op(x, xindex, xfill, y, yindex, yfill) + result_int_vals, ri_index = sparse_op(x, xdindex, xfill, + y, ydindex, yfill) + + self.assert_(rb_index.to_int_index().equals(ri_index)) + assert_equal(result_block_vals, result_int_vals) + + # check versus Series... + xseries = Series(x, xdindex.indices) + xseries = xseries.reindex(np.arange(TEST_LENGTH)).fillna(xfill) + + yseries = Series(y, ydindex.indices) + yseries = yseries.reindex(np.arange(TEST_LENGTH)).fillna(yfill) + + series_result = python_op(xseries, yseries) + series_result = series_result.reindex(ri_index.indices) + + assert_equal(result_block_vals, series_result.values) + assert_equal(result_int_vals, series_result.values) + + check_cases(_check_case) + +# too cute? oh but how I abhor code duplication + +check_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] +def make_nanoptestf(op): + def f(self): + sparse_op = getattr(splib, 'sparse_nan%s' % op) + python_op = getattr(operator, op) + self._nan_op_tests(sparse_op, python_op) + f.__name__ = 'test_nan%s' % op + return f + +def make_optestf(op): + def f(self): + sparse_op = getattr(splib, 'sparse_%s' % op) + python_op = getattr(operator, op) + self._op_tests(sparse_op, python_op) + f.__name__ = 'test_%s' % op + return f + +for op in check_ops: + f = make_nanoptestf(op) + g = make_optestf(op) + setattr(TestSparseOperators, f.__name__, f) + setattr(TestSparseOperators, g.__name__, g) + del f + del g + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/sparse/tests/test_list.py b/pandas/sparse/tests/test_list.py new file mode 100644 index 00000000..2b061bad --- /dev/null +++ b/pandas/sparse/tests/test_list.py @@ -0,0 +1,103 @@ +import unittest + +from numpy import nan +import numpy as np + +from pandas.sparse.api import SparseList, SparseArray +from pandas.util.testing import assert_almost_equal + +from test_sparse import assert_sp_array_equal + + +def assert_sp_list_equal(left, right): + assert_sp_array_equal(left.to_array(), right.to_array()) + +class TestSparseList(unittest.TestCase): + + def setUp(self): + self.na_data = np.array([nan, nan, 1, 2, 3, nan, 4, 5, nan, 6]) + self.zero_data = np.array([0, 0, 1, 2, 3, 0, 4, 5, 0, 6]) + + def test_constructor(self): + lst1 = SparseList(self.na_data[:5]) + exp = SparseList() + exp.append(self.na_data[:5]) + assert_sp_list_equal(lst1, exp) + + def test_len(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + self.assertEquals(len(splist), 5) + splist.append(arr[5]) + self.assertEquals(len(splist), 6) + splist.append(arr[6:]) + self.assertEquals(len(splist), 10) + + def test_append_na(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + sparr = splist.to_array() + 
assert_sp_array_equal(sparr, SparseArray(arr)) + + def test_append_zero(self): + arr = self.zero_data + splist = SparseList(fill_value=0) + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + sparr = splist.to_array() + assert_sp_array_equal(sparr, SparseArray(arr, fill_value=0)) + + def test_consolidate(self): + arr = self.na_data + exp_sparr = SparseArray(arr) + + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + consol = splist.consolidate(inplace=False) + self.assert_(consol.nchunks == 1) + self.assert_(splist.nchunks == 3) + assert_sp_array_equal(consol.to_array(), exp_sparr) + + splist.consolidate() + self.assert_(splist.nchunks == 1) + assert_sp_array_equal(splist.to_array(), exp_sparr) + + def test_copy(self): + arr = self.na_data + exp_sparr = SparseArray(arr) + + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + + cp = splist.copy() + cp.append(arr[6:]) + self.assertEquals(splist.nchunks, 2) + assert_sp_array_equal(cp.to_array(), exp_sparr) + + def test_getitem(self): + arr = self.na_data + splist = SparseList() + splist.append(arr[:5]) + splist.append(arr[5]) + splist.append(arr[6:]) + + for i in range(len(arr)): + assert_almost_equal(splist[i], arr[i]) + assert_almost_equal(splist[-i], arr[-i]) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/sparse/tests/test_sparse.py b/pandas/sparse/tests/test_sparse.py new file mode 100644 index 00000000..7e19b732 --- /dev/null +++ b/pandas/sparse/tests/test_sparse.py @@ -0,0 +1,1524 @@ +# pylint: disable-msg=E1101,W0612 + +from unittest import TestCase +import cPickle as pickle +import operator + +import nose + +from numpy import nan +import numpy as np +dec = np.testing.dec + +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal, assert_panel_equal) +from numpy.testing import assert_equal + +from pandas import Series, DataFrame, bdate_range, Panel +from pandas.core.datetools import BDay +from pandas.core.index import Index +from pandas.tseries.index import DatetimeIndex +import pandas.core.datetools as datetools +import pandas.util.testing as tm + +import pandas.sparse.frame as spf + +from pandas._sparse import BlockIndex, IntIndex +from pandas.sparse.api import (SparseSeries, SparseTimeSeries, + SparseDataFrame, SparsePanel, + SparseArray) + +import pandas.tests.test_frame as test_frame +import pandas.tests.test_panel as test_panel +import pandas.tests.test_series as test_series + +from test_array import assert_sp_array_equal + +def _test_data1(): + # nan-based + arr = np.arange(20, dtype=float) + index = np.arange(20) + arr[:2] = nan + arr[5:10] = nan + arr[-3:] = nan + + return arr, index + +def _test_data2(): + # nan-based + arr = np.arange(15, dtype=float) + index = np.arange(15) + arr[7:12] = nan + arr[-1:] = nan + return arr, index + +def _test_data1_zero(): + # zero-based + arr, index = _test_data1() + arr[np.isnan(arr)] = 0 + return arr, index + +def _test_data2_zero(): + # zero-based + arr, index = _test_data2() + arr[np.isnan(arr)] = 0 + return arr, index + +def assert_sp_series_equal(a, b): + assert(a.index.equals(b.index)) + assert_sp_array_equal(a, b) + + +def assert_sp_frame_equal(left, right, exact_indices=True): + """ + exact: Series SparseIndex objects must be exactly the same, otherwise just + compare dense representations + """ + for col, series in left.iteritems(): + assert(col in 
right) + # trade-off? + + if exact_indices: + assert_sp_series_equal(series, right[col]) + else: + assert_series_equal(series.to_dense(), right[col].to_dense()) + + assert_almost_equal(left.default_fill_value, + right.default_fill_value) + + # do I care? + # assert(left.default_kind == right.default_kind) + + for col in right: + assert(col in left) + +def assert_sp_panel_equal(left, right, exact_indices=True): + for item, frame in left.iterkv(): + assert(item in right) + # trade-off? + assert_sp_frame_equal(frame, right[item], exact_indices=exact_indices) + + assert_almost_equal(left.default_fill_value, + right.default_fill_value) + assert(left.default_kind == right.default_kind) + + for item in right: + assert(item in left) + +class TestSparseSeries(TestCase, + test_series.CheckNameIntegration): + + def setUp(self): + arr, index = _test_data1() + + date_index = bdate_range('1/1/2011', periods=len(index)) + + self.bseries = SparseSeries(arr, index=index, kind='block') + self.bseries.name = 'bseries' + + self.ts = self.bseries + + self.btseries = SparseSeries(arr, index=date_index, kind='block') + + self.iseries = SparseSeries(arr, index=index, kind='integer') + + arr, index = _test_data2() + self.bseries2 = SparseSeries(arr, index=index, kind='block') + self.iseries2 = SparseSeries(arr, index=index, kind='integer') + + arr, index = _test_data1_zero() + self.zbseries = SparseSeries(arr, index=index, kind='block', + fill_value=0) + self.ziseries = SparseSeries(arr, index=index, kind='integer', + fill_value=0) + + arr, index = _test_data2_zero() + self.zbseries2 = SparseSeries(arr, index=index, kind='block', + fill_value=0) + self.ziseries2 = SparseSeries(arr, index=index, kind='integer', + fill_value=0) + + def test_construct_DataFrame_with_sp_series(self): + # it works! 
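+        # regression check: passing a SparseSeries to the DataFrame
+        # constructor simply must not raise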
+ df = DataFrame({'col' : self.bseries}) + + def test_sparse_to_dense(self): + arr, index = _test_data1() + series = self.bseries.to_dense() + assert_equal(series, arr) + + series = self.bseries.to_dense(sparse_only=True) + assert_equal(series, arr[np.isfinite(arr)]) + + series = self.iseries.to_dense() + assert_equal(series, arr) + + arr, index = _test_data1_zero() + series = self.zbseries.to_dense() + assert_equal(series, arr) + + series = self.ziseries.to_dense() + assert_equal(series, arr) + + def test_dense_to_sparse(self): + series = self.bseries.to_dense() + bseries = series.to_sparse(kind='block') + iseries = series.to_sparse(kind='integer') + assert_sp_series_equal(bseries, self.bseries) + assert_sp_series_equal(iseries, self.iseries) + + # non-NaN fill value + series = self.zbseries.to_dense() + zbseries = series.to_sparse(kind='block', fill_value=0) + ziseries = series.to_sparse(kind='integer', fill_value=0) + assert_sp_series_equal(zbseries, self.zbseries) + assert_sp_series_equal(ziseries, self.ziseries) + + def test_to_dense_preserve_name(self): + assert(self.bseries.name is not None) + result = self.bseries.to_dense() + self.assertEquals(result.name, self.bseries.name) + + def test_constructor(self): + # test setup guys + self.assert_(np.isnan(self.bseries.fill_value)) + self.assert_(isinstance(self.bseries.sp_index, BlockIndex)) + self.assert_(np.isnan(self.iseries.fill_value)) + self.assert_(isinstance(self.iseries.sp_index, IntIndex)) + + self.assertEquals(self.zbseries.fill_value, 0) + assert_equal(self.zbseries.values, self.bseries.to_dense().fillna(0)) + + # pass SparseSeries + s2 = SparseSeries(self.bseries) + s3 = SparseSeries(self.iseries) + s4 = SparseSeries(self.zbseries) + assert_sp_series_equal(s2, self.bseries) + assert_sp_series_equal(s3, self.iseries) + assert_sp_series_equal(s4, self.zbseries) + + # Sparse time series works + date_index = bdate_range('1/1/2000', periods=len(self.bseries)) + s5 = SparseSeries(self.bseries, index=date_index) + self.assert_(isinstance(s5, SparseTimeSeries)) + + # pass Series + bseries2 = SparseSeries(self.bseries.to_dense()) + assert_equal(self.bseries.sp_values, bseries2.sp_values) + + # pass dict? + + # don't copy the data by default + values = np.ones(len(self.bseries.sp_values)) + sp = SparseSeries(values, sparse_index=self.bseries.sp_index) + sp.sp_values[:5] = 97 + self.assert_(values[0] == 97) + + # but can make it copy! 
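+        # with copy=True the constructor copies sp_values, so mutating the
+        # new series leaves the original `values` array untouched (still 97
+        # from the no-copy case above)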
+ sp = SparseSeries(values, sparse_index=self.bseries.sp_index, + copy=True) + sp.sp_values[:5] = 100 + self.assert_(values[0] == 97) + + def test_constructor_ndarray(self): + pass + + def test_constructor_nonnan(self): + arr = [0, 0, 0, nan, nan] + sp_series = SparseSeries(arr, fill_value=0) + assert_equal(sp_series.values, arr) + + def test_copy_astype(self): + cop = self.bseries.astype(np.float_) + self.assert_(cop is not self.bseries) + self.assert_(cop.sp_index is self.bseries.sp_index) + self.assert_(cop.dtype == np.float64) + + cop2 = self.iseries.copy() + + assert_sp_series_equal(cop, self.bseries) + assert_sp_series_equal(cop2, self.iseries) + + # test that data is copied + cop.sp_values[:5] = 97 + self.assert_(cop.sp_values[0] == 97) + self.assert_(self.bseries.sp_values[0] != 97) + + # correct fill value + zbcop = self.zbseries.copy() + zicop = self.ziseries.copy() + + assert_sp_series_equal(zbcop, self.zbseries) + assert_sp_series_equal(zicop, self.ziseries) + + # no deep copy + view = self.bseries.copy(deep=False) + view.sp_values[:5] = 5 + self.assert_((self.bseries.sp_values[:5] == 5).all()) + + def test_astype(self): + self.assertRaises(Exception, self.bseries.astype, np.int64) + + def test_kind(self): + self.assertEquals(self.bseries.kind, 'block') + self.assertEquals(self.iseries.kind, 'integer') + + def test_pickle(self): + def _test_roundtrip(series): + pickled = pickle.dumps(series, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + assert_sp_series_equal(series, unpickled) + assert_series_equal(series.to_dense(), unpickled.to_dense()) + + self._check_all(_test_roundtrip) + + def _check_all(self, check_func): + check_func(self.bseries) + check_func(self.iseries) + check_func(self.zbseries) + check_func(self.ziseries) + + def test_getitem(self): + def _check_getitem(sp, dense): + for idx, val in dense.iteritems(): + assert_almost_equal(val, sp[idx]) + + for i in xrange(len(dense)): + assert_almost_equal(sp[i], dense[i]) + # j = np.float64(i) + # assert_almost_equal(sp[j], dense[j]) + + # API change 1/6/2012 + # negative getitem works + # for i in xrange(len(dense)): + # assert_almost_equal(sp[-i], dense[-i]) + + _check_getitem(self.bseries, self.bseries.to_dense()) + _check_getitem(self.btseries, self.btseries.to_dense()) + + _check_getitem(self.zbseries, self.zbseries.to_dense()) + _check_getitem(self.iseries, self.iseries.to_dense()) + _check_getitem(self.ziseries, self.ziseries.to_dense()) + + # exception handling + self.assertRaises(Exception, self.bseries.__getitem__, + len(self.bseries) + 1) + + # index not contained + self.assertRaises(Exception, self.btseries.__getitem__, + self.btseries.index[-1] + BDay()) + + def test_get_get_value(self): + assert_almost_equal(self.bseries.get(10), self.bseries[10]) + self.assert_(self.bseries.get(len(self.bseries) + 1) is None) + + dt = self.btseries.index[10] + result = self.btseries.get(dt) + expected = self.btseries.to_dense()[dt] + assert_almost_equal(result, expected) + + assert_almost_equal(self.bseries.get_value(10), self.bseries[10]) + + def test_set_value(self): + idx = self.btseries.index[7] + res = self.btseries.set_value(idx, 0) + self.assert_(res is not self.btseries) + self.assertEqual(res[idx], 0) + + res = self.iseries.set_value('foobar', 0) + self.assert_(res is not self.iseries) + self.assert_(res.index[-1] == 'foobar') + self.assertEqual(res['foobar'], 0) + + def test_getitem_slice(self): + idx = self.bseries.index + res = self.bseries[::2] + self.assert_(isinstance(res, SparseSeries)) 
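+        # slicing a SparseSeries is equivalent to reindexing with the
+        # corresponding slice of its index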
+ assert_sp_series_equal(res, self.bseries.reindex(idx[::2])) + + res = self.bseries[:5] + self.assert_(isinstance(res, SparseSeries)) + assert_sp_series_equal(res, self.bseries.reindex(idx[:5])) + + res = self.bseries[5:] + assert_sp_series_equal(res, self.bseries.reindex(idx[5:])) + + # negative indices + res = self.bseries[:-3] + assert_sp_series_equal(res, self.bseries.reindex(idx[:-3])) + + def test_take(self): + def _compare_with_dense(sp): + dense = sp.to_dense() + + def _compare(idx): + dense_result = dense.take(idx).values + sparse_result = sp.take(idx) + self.assert_(isinstance(sparse_result, SparseSeries)) + assert_almost_equal(dense_result, sparse_result.values) + + _compare([1., 2., 3., 4., 5., 0.]) + _compare([7, 2, 9, 0, 4]) + _compare([3, 6, 3, 4, 7]) + + self._check_all(_compare_with_dense) + + self.assertRaises(Exception, self.bseries.take, [-1, 0]) + self.assertRaises(Exception, self.bseries.take, + [0, len(self.bseries) + 1]) + + # Corner case + sp = SparseSeries(np.ones(10.) * nan) + assert_almost_equal(sp.take([0, 1, 2, 3, 4]), np.repeat(nan, 5)) + + def test_setitem(self): + self.assertRaises(Exception, self.bseries.__setitem__, 5, 7.) + self.assertRaises(Exception, self.iseries.__setitem__, 5, 7.) + + def test_setslice(self): + self.assertRaises(Exception, self.bseries.__setslice__, 5, 10, 7.) + + def test_operators(self): + def _check_op(a, b, op): + sp_result = op(a, b) + adense = a.to_dense() if isinstance(a, SparseSeries) else a + bdense = b.to_dense() if isinstance(b, SparseSeries) else b + dense_result = op(adense, bdense) + assert_almost_equal(sp_result.to_dense(), dense_result) + + def check(a, b): + _check_op(a, b, operator.add) + _check_op(a, b, operator.sub) + _check_op(a, b, operator.truediv) + _check_op(a, b, operator.floordiv) + _check_op(a, b, operator.mul) + + _check_op(a, b, lambda x, y: operator.add(y, x)) + _check_op(a, b, lambda x, y: operator.sub(y, x)) + _check_op(a, b, lambda x, y: operator.truediv(y, x)) + _check_op(a, b, lambda x, y: operator.floordiv(y, x)) + _check_op(a, b, lambda x, y: operator.mul(y, x)) + + # NaN ** 0 = 1 in C? 
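+            # pow checks stay disabled, presumably because nan ** 0 evaluates
+            # to 1.0 and the dense result then disagrees with the sparse one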
+ # _check_op(a, b, operator.pow) + # _check_op(a, b, lambda x, y: operator.pow(y, x)) + + check(self.bseries, self.bseries) + check(self.iseries, self.iseries) + check(self.bseries, self.iseries) + + check(self.bseries, self.bseries2) + check(self.bseries, self.iseries2) + check(self.iseries, self.iseries2) + + # scalar value + check(self.bseries, 5) + + # zero-based + check(self.zbseries, self.zbseries * 2) + check(self.zbseries, self.zbseries2) + check(self.ziseries, self.ziseries2) + + # with dense + result = self.bseries + self.bseries.to_dense() + assert_sp_series_equal(result, self.bseries + self.bseries) + + # @dec.knownfailureif(True, 'Known NumPy failer as of 1.5.1') + def test_operators_corner2(self): + raise nose.SkipTest('known failer on numpy 1.5.1') + + # NumPy circumvents __r*__ operations + val = np.float64(3.0) + result = val - self.zbseries + assert_sp_series_equal(result, 3 - self.zbseries) + + def test_binary_operators(self): + def _check_inplace_op(op): + tmp = self.bseries.copy() + self.assertRaises(NotImplementedError, op, tmp, self.bseries) + inplace_ops = ['iadd', 'isub', 'imul', 'itruediv', 'ifloordiv', 'ipow'] + for op in inplace_ops: + _check_inplace_op(getattr(operator, op)) + + def test_reindex(self): + def _compare_with_series(sps, new_index): + spsre = sps.reindex(new_index) + + series = sps.to_dense() + seriesre = series.reindex(new_index) + seriesre = seriesre.to_sparse(fill_value=sps.fill_value) + + assert_sp_series_equal(spsre, seriesre) + assert_series_equal(spsre.to_dense(), seriesre.to_dense()) + + _compare_with_series(self.bseries, self.bseries.index[::2]) + _compare_with_series(self.bseries, list(self.bseries.index[::2])) + _compare_with_series(self.bseries, self.bseries.index[:10]) + _compare_with_series(self.bseries, self.bseries.index[5:]) + + _compare_with_series(self.zbseries, self.zbseries.index[::2]) + _compare_with_series(self.zbseries, self.zbseries.index[:10]) + _compare_with_series(self.zbseries, self.zbseries.index[5:]) + + # special cases + same_index = self.bseries.reindex(self.bseries.index) + assert_sp_series_equal(self.bseries, same_index) + self.assert_(same_index is not self.bseries) + + # corner cases + sp = SparseSeries([], index=[]) + sp_zero = SparseSeries([], index=[], fill_value=0) + _compare_with_series(sp, np.arange(10)) + + # with copy=False + reindexed = self.bseries.reindex(self.bseries.index, copy=True) + reindexed.sp_values[:] = 1. + self.assert_((self.bseries.sp_values != 1.).all()) + + reindexed = self.bseries.reindex(self.bseries.index, copy=False) + reindexed.sp_values[:] = 1. 
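+        # with copy=False the reindexed series shares sp_values with the
+        # original, so the assignment above is visible through self.bseries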
+ self.assert_((self.bseries.sp_values == 1.).all()) + + def test_sparse_reindex(self): + length = 10 + + def _check(values, index1, index2, fill_value): + first_series = SparseSeries(values, sparse_index=index1, + fill_value=fill_value) + reindexed = first_series.sparse_reindex(index2) + self.assert_(reindexed.sp_index is index2) + + int_indices1 = index1.to_int_index().indices + int_indices2 = index2.to_int_index().indices + + expected = Series(values, index=int_indices1) + expected = expected.reindex(int_indices2).fillna(fill_value) + assert_almost_equal(expected.values, reindexed.sp_values) + + # make sure level argument asserts + expected = expected.reindex(int_indices2).fillna(fill_value) + + def _check_with_fill_value(values, first, second, fill_value=nan): + i_index1 = IntIndex(length, first) + i_index2 = IntIndex(length, second) + + b_index1 = i_index1.to_block_index() + b_index2 = i_index2.to_block_index() + + _check(values, i_index1, i_index2, fill_value) + _check(values, b_index1, b_index2, fill_value) + + def _check_all(values, first, second): + _check_with_fill_value(values, first, second, fill_value=nan) + _check_with_fill_value(values, first, second, fill_value=0) + + index1 = [2, 4, 5, 6, 8, 9] + values1 = np.arange(6.) + + _check_all(values1, index1, [2, 4, 5]) + _check_all(values1, index1, [2, 3, 4, 5, 6, 7, 8, 9]) + _check_all(values1, index1, [0, 1]) + _check_all(values1, index1, [0, 1, 7, 8, 9]) + _check_all(values1, index1, []) + + def test_repr(self): + bsrepr = repr(self.bseries) + isrepr = repr(self.iseries) + + def test_iter(self): + pass + + def test_truncate(self): + pass + + def test_fillna(self): + pass + + def test_groupby(self): + pass + + def test_reductions(self): + def _compare_with_dense(obj, op): + sparse_result = getattr(obj, op)() + series = obj.to_dense() + dense_result = getattr(series, op)() + self.assertEquals(sparse_result, dense_result) + + to_compare = ['count', 'sum', 'mean', 'std', 'var', 'skew'] + def _compare_all(obj): + for op in to_compare: + _compare_with_dense(obj, op) + + _compare_all(self.bseries) + + self.bseries.sp_values[5:10] = np.NaN + _compare_all(self.bseries) + + _compare_all(self.zbseries) + self.zbseries.sp_values[5:10] = np.NaN + _compare_all(self.zbseries) + + series = self.zbseries.copy() + series.fill_value = 2 + _compare_all(series) + + nonna = Series(np.random.randn(20)).to_sparse() + _compare_all(nonna) + + nonna2 = Series(np.random.randn(20)).to_sparse(fill_value=0) + _compare_all(nonna2) + + def test_dropna(self): + sp = SparseSeries([0, 0, 0, nan, nan, 5, 6], + fill_value=0) + + sp_valid = sp.valid() + assert_almost_equal(sp_valid.values, + sp.to_dense().valid().values) + self.assert_(sp_valid.index.equals(sp.to_dense().valid().index)) + self.assertEquals(len(sp_valid.sp_values), 2) + + result = self.bseries.dropna() + expected = self.bseries.to_dense().dropna() + self.assert_(not isinstance(result, SparseSeries)) + tm.assert_series_equal(result, expected) + + def test_homogenize(self): + def _check_matches(indices, expected): + data = {} + for i, idx in enumerate(indices): + data[i] = SparseSeries(idx.to_int_index().indices, + sparse_index=idx) + homogenized = spf.homogenize(data) + + for k, v in homogenized.iteritems(): + assert(v.sp_index.equals(expected)) + + indices1 = [BlockIndex(10, [2], [7]), + BlockIndex(10, [1, 6], [3, 4]), + BlockIndex(10, [0], [10])] + expected1 = BlockIndex(10, [2, 6], [2, 3]) + _check_matches(indices1, expected1) + + indices2 = [BlockIndex(10, [2], [7]), + BlockIndex(10, [2], [7])] + 
expected2 = indices2[0] + _check_matches(indices2, expected2) + + # must have NaN fill value + data = {'a' : SparseSeries(np.arange(7), sparse_index=expected2, + fill_value=0)} + nose.tools.assert_raises(Exception, spf.homogenize, data) + + def test_fill_value_corner(self): + cop = self.zbseries.copy() + cop.fill_value = 0 + result = self.bseries / cop + + self.assert_(np.isnan(result.fill_value)) + + cop2 = self.zbseries.copy() + cop2.fill_value = 1 + result = cop2 / cop + self.assert_(np.isnan(result.fill_value)) + + def test_shift(self): + series = SparseSeries([nan, 1., 2., 3., nan, nan], + index=np.arange(6)) + + shifted = series.shift(0) + self.assert_(shifted is not series) + assert_sp_series_equal(shifted, series) + + f = lambda s: s.shift(1) + _dense_series_compare(series, f) + + f = lambda s: s.shift(-2) + _dense_series_compare(series, f) + + series = SparseSeries([nan, 1., 2., 3., nan, nan], + index=bdate_range('1/1/2000', periods=6)) + f = lambda s: s.shift(2, freq='B') + _dense_series_compare(series, f) + + f = lambda s: s.shift(2, freq=datetools.bday) + _dense_series_compare(series, f) + + def test_cumsum(self): + result = self.bseries.cumsum() + expected = self.bseries.to_dense().cumsum() + self.assert_(isinstance(result, SparseSeries)) + self.assertEquals(result.name, self.bseries.name) + assert_series_equal(result.to_dense(), expected) + + result = self.zbseries.cumsum() + expected = self.zbseries.to_dense().cumsum() + self.assert_(isinstance(result, Series)) + assert_series_equal(result, expected) + + def test_combine_first(self): + s = self.bseries + + result = s[::2].combine_first(s) + result2 = s[::2].combine_first(s.to_dense()) + + expected = s[::2].to_dense().combine_first(s.to_dense()) + expected = expected.to_sparse(fill_value=s.fill_value) + + assert_sp_series_equal(result, result2) + assert_sp_series_equal(result, expected) + +class TestSparseTimeSeries(TestCase): + pass + +class TestSparseDataFrame(TestCase, test_frame.SafeForSparse): + klass = SparseDataFrame + + def setUp(self): + self.data = {'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B' : [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C' : np.arange(10), + 'D' : [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]} + + self.dates = bdate_range('1/1/2011', periods=10) + + self.frame = SparseDataFrame(self.data, index=self.dates) + self.iframe = SparseDataFrame(self.data, index=self.dates, + default_kind='integer') + + values = self.frame.values.copy() + values[np.isnan(values)] = 0 + + self.zframe = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=0, + index=self.dates) + + values = self.frame.values.copy() + values[np.isnan(values)] = 2 + self.fill_frame = SparseDataFrame(values, columns=['A', 'B', 'C', 'D'], + default_fill_value=2, + index=self.dates) + + self.empty = SparseDataFrame() + + def test_as_matrix(self): + empty = self.empty.as_matrix() + self.assert_(empty.shape == (0, 0)) + + no_cols = SparseDataFrame(index=np.arange(10)) + mat = no_cols.as_matrix() + self.assert_(mat.shape == (10, 0)) + + no_index = SparseDataFrame(columns=np.arange(10)) + mat = no_index.as_matrix() + self.assert_(mat.shape == (0, 10)) + + def test_copy(self): + cp = self.frame.copy() + self.assert_(isinstance(cp, SparseDataFrame)) + assert_sp_frame_equal(cp, self.frame) + self.assert_(cp.index is self.frame.index) + + def test_constructor(self): + for col, series in self.frame.iteritems(): + self.assert_(isinstance(series, SparseSeries)) + + self.assert_(isinstance(self.iframe['A'].sp_index, IntIndex)) + + # 
constructed zframe from matrix above + self.assertEquals(self.zframe['A'].fill_value, 0) + assert_almost_equal([0, 0, 0, 0, 1, 2, 3, 4, 5, 6], + self.zframe['A'].values) + + # construct from nested dict + data = {} + for c, s in self.frame.iteritems(): + data[c] = s.to_dict() + + sdf = SparseDataFrame(data) + assert_sp_frame_equal(sdf, self.frame) + + # TODO: test data is copied from inputs + + # init dict with different index + idx = self.frame.index[:5] + cons = SparseDataFrame(self.frame._series, index=idx, + columns=self.frame.columns, + default_fill_value=self.frame.default_fill_value, + default_kind=self.frame.default_kind) + reindexed = self.frame.reindex(idx) + assert_sp_frame_equal(cons, reindexed) + + # assert level parameter breaks reindex + self.assertRaises(Exception, self.frame.reindex, idx, level=0) + + repr(self.frame) + + def test_constructor_ndarray(self): + # no index or columns + sp = SparseDataFrame(self.frame.values) + + # 1d + sp = SparseDataFrame(self.data['A'], index=self.dates, + columns=['A']) + assert_sp_frame_equal(sp, self.frame.reindex(columns=['A'])) + + # raise on level argument + self.assertRaises(Exception, self.frame.reindex, columns=['A'], + level=1) + + # wrong length index / columns + self.assertRaises(Exception, SparseDataFrame, self.frame.values, + index=self.frame.index[:-1]) + self.assertRaises(Exception, SparseDataFrame, self.frame.values, + columns=self.frame.columns[:-1]) + + def test_constructor_empty(self): + sp = SparseDataFrame() + self.assert_(len(sp.index) == 0) + self.assert_(len(sp.columns) == 0) + + def test_constructor_dataframe(self): + dense = self.frame.to_dense() + sp = SparseDataFrame(dense) + assert_sp_frame_equal(sp, self.frame) + + def test_array_interface(self): + res = np.sqrt(self.frame) + dres = np.sqrt(self.frame.to_dense()) + assert_frame_equal(res.to_dense(), dres) + + def test_pickle(self): + def _test_roundtrip(frame): + pickled = pickle.dumps(frame, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + assert_sp_frame_equal(frame, unpickled) + + _test_roundtrip(SparseDataFrame()) + self._check_all(_test_roundtrip) + + def test_dense_to_sparse(self): + df = DataFrame({'A' : [nan, nan, nan, 1, 2], + 'B' : [1, 2, nan, nan, nan]}) + sdf = df.to_sparse() + self.assert_(isinstance(sdf, SparseDataFrame)) + self.assert_(np.isnan(sdf.default_fill_value)) + self.assert_(isinstance(sdf['A'].sp_index, BlockIndex)) + tm.assert_frame_equal(sdf.to_dense(), df) + + sdf = df.to_sparse(kind='integer') + self.assert_(isinstance(sdf['A'].sp_index, IntIndex)) + + df = DataFrame({'A' : [0, 0, 0, 1, 2], + 'B' : [1, 2, 0, 0, 0]}, dtype=float) + sdf = df.to_sparse(fill_value=0) + self.assertEquals(sdf.default_fill_value, 0) + tm.assert_frame_equal(sdf.to_dense(), df) + + def test_sparse_to_dense(self): + pass + + def test_sparse_series_ops(self): + self._check_all(self._check_frame_ops) + + def _check_frame_ops(self, frame): + fill = frame.default_fill_value + + def _compare_to_dense(a, b, da, db, op): + sparse_result = op(a, b) + dense_result = op(da, db) + + dense_result = dense_result.to_sparse(fill_value=fill) + assert_sp_frame_equal(sparse_result, dense_result, + exact_indices=False) + + if isinstance(a, DataFrame) and isinstance(db, DataFrame): + mixed_result = op(a, db) + self.assert_(isinstance(mixed_result, SparseDataFrame)) + assert_sp_frame_equal(mixed_result, sparse_result, + exact_indices=False) + + opnames = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + ops = [getattr(operator, name) for name in opnames] + + 
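+        # apply each binary operator to sparse operands and compare against the dense result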
fidx = frame.index + + # time series operations + + series = [frame['A'], frame['B'], + frame['C'], frame['D'], + frame['A'].reindex(fidx[:7]), + frame['A'].reindex(fidx[::2]), + SparseSeries([], index=[])] + + for op in ops: + _compare_to_dense(frame, frame[::2], frame.to_dense(), + frame[::2].to_dense(), op) + for s in series: + _compare_to_dense(frame, s, frame.to_dense(), + s.to_dense(), op) + _compare_to_dense(s, frame, s.to_dense(), + frame.to_dense(), op) + + # cross-sectional operations + series = [frame.xs(fidx[0]), + frame.xs(fidx[3]), + frame.xs(fidx[5]), + frame.xs(fidx[7]), + frame.xs(fidx[5])[:2]] + + for op in ops: + for s in series: + _compare_to_dense(frame, s, frame.to_dense(), + s, op) + _compare_to_dense(s, frame, s, + frame.to_dense(), op) + + # it works! + result = self.frame + self.frame.ix[:, ['A', 'B']] + + def test_op_corners(self): + empty = self.empty + self.empty + self.assert_(empty.empty) + + foo = self.frame + self.empty + self.assert_(isinstance(foo.index, DatetimeIndex)) + assert_frame_equal(foo, self.frame * np.nan) + + foo = self.empty + self.frame + assert_frame_equal(foo, self.frame * np.nan) + + def test_scalar_ops(self): + pass + + def test_getitem(self): + pass + + def test_set_value(self): + res = self.frame.set_value('foobar', 'B', 1.5) + self.assert_(res is not self.frame) + self.assert_(res.index[-1] == 'foobar') + self.assertEqual(res.get_value('foobar', 'B'), 1.5) + + res2 = res.set_value('foobar', 'qux', 1.5) + self.assert_(res2 is not res) + self.assert_(np.array_equal(res2.columns, + list(self.frame.columns) + ['qux'])) + self.assertEqual(res2.get_value('foobar', 'qux'), 1.5) + + def test_fancy_index_misc(self): + # axis = 0 + sliced = self.frame.ix[-2:, :] + expected = self.frame.reindex(index=self.frame.index[-2:]) + assert_sp_frame_equal(sliced, expected) + + # axis = 1 + sliced = self.frame.ix[:, -2:] + expected = self.frame.reindex(columns=self.frame.columns[-2:]) + assert_sp_frame_equal(sliced, expected) + + def test_getitem_overload(self): + # slicing + sl = self.frame[:20] + assert_sp_frame_equal(sl, self.frame.reindex(self.frame.index[:20])) + + # boolean indexing + d = self.frame.index[5] + indexer = self.frame.index > d + + subindex = self.frame.index[indexer] + subframe = self.frame[indexer] + + self.assert_(np.array_equal(subindex, subframe.index)) + self.assertRaises(Exception, self.frame.__getitem__, indexer[:-1]) + + def test_setitem(self): + def _check_frame(frame): + N = len(frame) + + # insert SparseSeries + frame['E'] = frame['A'] + self.assert_(isinstance(frame['E'], SparseSeries)) + assert_sp_series_equal(frame['E'], frame['A']) + + # insert SparseSeries differently-indexed + to_insert = frame['A'][::2] + frame['E'] = to_insert + assert_series_equal(frame['E'].to_dense(), + to_insert.to_dense().reindex(frame.index)) + + # insert Series + frame['F'] = frame['A'].to_dense() + self.assert_(isinstance(frame['F'], SparseSeries)) + assert_sp_series_equal(frame['F'], frame['A']) + + # insert Series differently-indexed + to_insert = frame['A'].to_dense()[::2] + frame['G'] = to_insert + assert_series_equal(frame['G'].to_dense(), + to_insert.reindex(frame.index)) + + # insert ndarray + frame['H'] = np.random.randn(N) + self.assert_(isinstance(frame['H'], SparseSeries)) + + to_sparsify = np.random.randn(N) + to_sparsify[N // 2:] = frame.default_fill_value + frame['I'] = to_sparsify + self.assertEquals(len(frame['I'].sp_values), N // 2) + + # insert ndarray wrong size + self.assertRaises(Exception, frame.__setitem__, 'foo', + 
np.random.randn(N - 1)) + + # scalar value + frame['J'] = 5 + self.assertEquals(len(frame['J'].sp_values), N) + self.assert_((frame['J'].sp_values == 5).all()) + + frame['K'] = frame.default_fill_value + self.assertEquals(len(frame['K'].sp_values), 0) + + + self._check_all(_check_frame) + + def test_setitem_corner(self): + self.frame['a'] = self.frame['B'] + assert_sp_series_equal(self.frame['a'], self.frame['B']) + + def test_setitem_array(self): + arr = self.frame['B'].view(SparseArray) + + self.frame['E'] = arr + assert_sp_series_equal(self.frame['E'], self.frame['B']) + self.assertRaises(Exception, self.frame.__setitem__, 'F', arr[:-1]) + + def test_delitem(self): + A = self.frame['A'] + C = self.frame['C'] + + del self.frame['B'] + self.assert_('B' not in self.frame) + assert_sp_series_equal(self.frame['A'], A) + assert_sp_series_equal(self.frame['C'], C) + + del self.frame['D'] + self.assert_('D' not in self.frame) + + del self.frame['A'] + self.assert_('A' not in self.frame) + + def test_set_columns(self): + self.frame.columns = self.frame.columns + self.assertRaises(Exception, setattr, self.frame, 'columns', + self.frame.columns[:-1]) + + def test_set_index(self): + self.frame.index = self.frame.index + self.assertRaises(Exception, setattr, self.frame, 'index', + self.frame.index[:-1]) + + def test_append(self): + a = self.frame[:5] + b = self.frame[5:] + + appended = a.append(b) + assert_sp_frame_equal(appended, self.frame) + + a = self.frame.ix[:5, :3] + b = self.frame.ix[5:] + appended = a.append(b) + assert_sp_frame_equal(appended.ix[:, :3], self.frame.ix[:, :3]) + + def test_apply(self): + applied = self.frame.apply(np.sqrt) + self.assert_(isinstance(applied, SparseDataFrame)) + assert_almost_equal(applied.values, np.sqrt(self.frame.values)) + + applied = self.fill_frame.apply(np.sqrt) + self.assert_(applied['A'].fill_value == np.sqrt(2)) + + # agg / broadcast + applied = self.frame.apply(np.sum) + assert_series_equal(applied, + self.frame.to_dense().apply(np.sum)) + + broadcasted = self.frame.apply(np.sum, broadcast=True) + self.assert_(isinstance(broadcasted, SparseDataFrame)) + assert_frame_equal(broadcasted.to_dense(), + self.frame.to_dense().apply(np.sum, broadcast=True)) + + self.assert_(self.empty.apply(np.sqrt) is self.empty) + + def test_applymap(self): + # just test that it works + result = self.frame.applymap(lambda x: x * 2) + self.assert_(isinstance(result, SparseDataFrame)) + + def test_astype(self): + self.assertRaises(Exception, self.frame.astype, np.int64) + + def test_fillna(self): + df = self.zframe.reindex(range(5)) + result = df.fillna(0) + expected = df.to_dense().fillna(0).to_sparse(fill_value=0) + assert_sp_frame_equal(result, expected) + + result = df.copy() + result.fillna(0, inplace=True) + expected = df.to_dense().fillna(0).to_sparse(fill_value=0) + assert_sp_frame_equal(result, expected) + + result = df.copy() + result = df['A'] + result.fillna(0, inplace=True) + assert_series_equal(result, df['A'].fillna(0)) + + def test_rename(self): + # just check this works + renamed = self.frame.rename(index=str) + renamed = self.frame.rename(columns=lambda x: '%s%d' % (x, len(x))) + + def test_corr(self): + res = self.frame.corr() + assert_frame_equal(res, self.frame.to_dense().corr()) + + def test_describe(self): + self.frame['foo'] = np.nan + desc = self.frame.describe() + + def test_join(self): + left = self.frame.ix[:, ['A', 'B']] + right = self.frame.ix[:, ['C', 'D']] + joined = left.join(right) + assert_sp_frame_equal(joined, self.frame) + + right = 
self.frame.ix[:, ['B', 'D']] + self.assertRaises(Exception, left.join, right) + + def test_reindex(self): + + def _check_frame(frame): + index = frame.index + sidx = index[::2] + sidx2 = index[:5] + + sparse_result = frame.reindex(sidx) + dense_result = frame.to_dense().reindex(sidx) + assert_frame_equal(sparse_result.to_dense(), dense_result) + + assert_frame_equal(frame.reindex(list(sidx)).to_dense(), + dense_result) + + sparse_result2 = sparse_result.reindex(index) + dense_result2 = dense_result.reindex(index) + assert_frame_equal(sparse_result2.to_dense(), dense_result2) + + # propagate CORRECT fill value + assert_almost_equal(sparse_result.default_fill_value, + frame.default_fill_value) + assert_almost_equal(sparse_result['A'].fill_value, + frame['A'].fill_value) + + # length zero + length_zero = frame.reindex([]) + self.assertEquals(len(length_zero), 0) + self.assertEquals(len(length_zero.columns), len(frame.columns)) + self.assertEquals(len(length_zero['A']), 0) + + # frame being reindexed has length zero + length_n = length_zero.reindex(index) + self.assertEquals(len(length_n), len(frame)) + self.assertEquals(len(length_n.columns), len(frame.columns)) + self.assertEquals(len(length_n['A']), len(frame)) + + # reindex columns + reindexed = frame.reindex(columns=['A', 'B', 'Z']) + self.assertEquals(len(reindexed.columns), 3) + assert_almost_equal(reindexed['Z'].fill_value, + frame.default_fill_value) + self.assert_(np.isnan(reindexed['Z'].sp_values).all()) + + _check_frame(self.frame) + _check_frame(self.iframe) + _check_frame(self.zframe) + _check_frame(self.fill_frame) + + # with copy=False + reindexed = self.frame.reindex(self.frame.index, copy=False) + reindexed['F'] = reindexed['A'] + self.assert_('F' in self.frame) + + reindexed = self.frame.reindex(self.frame.index) + reindexed['G'] = reindexed['A'] + self.assert_('G' not in self.frame) + + def test_reindex_fill_value(self): + rng = bdate_range('20110110', periods=20) + result = self.zframe.reindex(rng, fill_value=0) + expected = self.zframe.reindex(rng).fillna(0) + assert_sp_frame_equal(result, expected) + + def test_take(self): + result = self.frame.take([1, 0, 2], axis=1) + expected = self.frame.reindex(columns=['B', 'A', 'C']) + assert_sp_frame_equal(result, expected) + + def test_density(self): + df = SparseDataFrame({'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B' : [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'C' : np.arange(10), + 'D' : [0, 1, 2, 3, 4, 5, nan, nan, nan, nan]}) + + self.assertEquals(df.density, 0.75) + + def test_to_dense(self): + def _check(frame): + dense_dm = frame.to_dense() + assert_frame_equal(frame, dense_dm) + + self._check_all(_check) + + def test_stack_sparse_frame(self): + def _check(frame): + dense_frame = frame.to_dense() + + wp = Panel.from_dict({'foo' : frame}) + from_dense_lp = wp.to_frame() + + from_sparse_lp = spf.stack_sparse_frame(frame) + + self.assert_(np.array_equal(from_dense_lp.values, + from_sparse_lp.values)) + + + _check(self.frame) + _check(self.iframe) + + # for now + self.assertRaises(Exception, _check, self.zframe) + self.assertRaises(Exception, _check, self.fill_frame) + + def test_transpose(self): + def _check(frame): + transposed = frame.T + untransposed = transposed.T + assert_sp_frame_equal(frame, untransposed) + self._check_all(_check) + + def test_shift(self): + def _check(frame): + shifted = frame.shift(0) + self.assert_(shifted is not frame) + assert_sp_frame_equal(shifted, frame) + + f = lambda s: s.shift(1) + _dense_frame_compare(frame, f) + + f = lambda s: 
s.shift(-2) + _dense_frame_compare(frame, f) + + f = lambda s: s.shift(2, freq='B') + _dense_frame_compare(frame, f) + + f = lambda s: s.shift(2, freq=datetools.bday) + _dense_frame_compare(frame, f) + + self._check_all(_check) + + def test_count(self): + result = self.frame.count() + dense_result = self.frame.to_dense().count() + assert_series_equal(result, dense_result) + + result = self.frame.count(1) + dense_result = self.frame.to_dense().count(1) + + # win32 don't check dtype + assert_series_equal(result, dense_result, check_dtype=False) + + def test_cumsum(self): + result = self.frame.cumsum() + expected = self.frame.to_dense().cumsum() + self.assert_(isinstance(result, SparseDataFrame)) + assert_frame_equal(result.to_dense(), expected) + + def _check_all(self, check_func): + check_func(self.frame) + check_func(self.iframe) + check_func(self.zframe) + check_func(self.fill_frame) + + def test_combine_first(self): + df = self.frame + + result = df[::2].combine_first(df) + result2 = df[::2].combine_first(df.to_dense()) + + expected = df[::2].to_dense().combine_first(df.to_dense()) + expected = expected.to_sparse(fill_value=df.default_fill_value) + + assert_sp_frame_equal(result, result2) + assert_sp_frame_equal(result, expected) + + def test_combine_add(self): + df = self.frame.to_dense() + df2 = df.copy() + df2['C'][:3] = np.nan + df['A'][:3] = 5.7 + + result = df.to_sparse().add(df2.to_sparse(), fill_value=0) + expected = df.add(df2, fill_value=0).to_sparse() + assert_sp_frame_equal(result, expected) + + def test_isin(self): + sparse_df = DataFrame({'flag': [1., 0., 1.]}).to_sparse(fill_value=0.) + xp = sparse_df[sparse_df.flag == 1.] + rs = sparse_df[sparse_df.flag.isin([1.])] + assert_frame_equal(xp, rs) + +def _dense_series_compare(s, f): + result = f(s) + assert(isinstance(result, SparseSeries)) + dense_result = f(s.to_dense()) + assert_series_equal(result.to_dense(), dense_result) + +def _dense_frame_compare(frame, f): + result = f(frame) + assert(isinstance(frame, SparseDataFrame)) + dense_result = f(frame.to_dense()) + assert_frame_equal(result.to_dense(), dense_result) + +def panel_data1(): + index = bdate_range('1/1/2011', periods=8) + + return DataFrame({ + 'A' : [nan, nan, nan, 0, 1, 2, 3, 4], + 'B' : [0, 1, 2, 3, 4, nan, nan, nan], + 'C' : [0, 1, 2, nan, nan, nan, 3, 4], + 'D' : [nan, 0, 1, nan, 2, 3, 4, nan] + }, index=index) + + +def panel_data2(): + index = bdate_range('1/1/2011', periods=9) + + return DataFrame({ + 'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5], + 'B' : [0, 1, 2, 3, 4, 5, nan, nan, nan], + 'C' : [0, 1, 2, nan, nan, nan, 3, 4, 5], + 'D' : [nan, 0, 1, nan, 2, 3, 4, 5, nan] + }, index=index) + + +def panel_data3(): + index = bdate_range('1/1/2011', periods=10).shift(-2) + + return DataFrame({ + 'A' : [nan, nan, nan, 0, 1, 2, 3, 4, 5, 6], + 'B' : [0, 1, 2, 3, 4, 5, 6, nan, nan, nan], + 'C' : [0, 1, 2, nan, nan, nan, 3, 4, 5, 6], + 'D' : [nan, 0, 1, nan, 2, 3, 4, 5, 6, nan] + }, index=index) + +class TestSparsePanel(TestCase, + test_panel.SafeForLongAndSparse, + test_panel.SafeForSparse): + + @classmethod + def assert_panel_equal(cls, x, y): + assert_sp_panel_equal(x, y) + + def setUp(self): + self.data_dict = { + 'ItemA' : panel_data1(), + 'ItemB' : panel_data2(), + 'ItemC' : panel_data3(), + 'ItemD' : panel_data1(), + } + self.panel = SparsePanel(self.data_dict) + + @staticmethod + def _test_op(panel, op): + # arithmetic tests + result = op(panel, 1) + assert_sp_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) + + def test_constructor(self): + 
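+        # 'Item0' is not a key of data_dict, so an explicit items list containing it should raise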
self.assertRaises(Exception, SparsePanel, self.data_dict, + items=['Item0', 'ItemA', 'ItemB']) + + def test_from_dict(self): + fd = SparsePanel.from_dict(self.data_dict) + assert_sp_panel_equal(fd, self.panel) + + def test_pickle(self): + def _test_roundtrip(panel): + pickled = pickle.dumps(panel, protocol=pickle.HIGHEST_PROTOCOL) + unpickled = pickle.loads(pickled) + self.assert_(isinstance(unpickled.items, Index)) + self.assert_(isinstance(unpickled.major_axis, Index)) + self.assert_(isinstance(unpickled.minor_axis, Index)) + assert_sp_panel_equal(panel, unpickled) + + _test_roundtrip(self.panel) + + def test_dense_to_sparse(self): + wp = Panel.from_dict(self.data_dict) + dwp = wp.to_sparse() + self.assert_(isinstance(dwp['ItemA']['A'], SparseSeries)) + + def test_to_dense(self): + dwp = self.panel.to_dense() + dwp2 = Panel.from_dict(self.data_dict) + assert_panel_equal(dwp, dwp2) + + def test_to_frame(self): + def _compare_with_dense(panel): + slp = panel.to_frame() + dlp = panel.to_dense().to_frame() + + self.assert_(np.array_equal(slp.values, dlp.values)) + self.assert_(slp.index.equals(dlp.index)) + + _compare_with_dense(self.panel) + _compare_with_dense(self.panel.reindex(items=['ItemA'])) + + zero_panel = SparsePanel(self.data_dict, default_fill_value=0) + self.assertRaises(Exception, zero_panel.to_frame) + + self.assertRaises(Exception, self.panel.to_frame, + filter_observations=False) + + def test_long_to_wide_sparse(self): + pass + + def test_values(self): + pass + + def test_setitem(self): + self.panel['ItemE'] = self.panel['ItemC'] + self.panel['ItemF'] = self.panel['ItemC'].to_dense() + + assert_sp_frame_equal(self.panel['ItemE'], self.panel['ItemC']) + assert_sp_frame_equal(self.panel['ItemF'], self.panel['ItemC']) + assert_almost_equal(self.panel.items, ['ItemA', 'ItemB', 'ItemC', + 'ItemD', 'ItemE', 'ItemF']) + + self.assertRaises(Exception, self.panel.__setitem__, 'item6', 1) + + def test_set_value(self): + def _check_loc(item, major, minor, val=1.5): + res = self.panel.set_value(item, major, minor, val) + self.assert_(res is not self.panel) + self.assertEquals(res.get_value(item, major, minor), val) + + _check_loc('ItemA', self.panel.major_axis[4], self.panel.minor_axis[3]) + _check_loc('ItemF', self.panel.major_axis[4], self.panel.minor_axis[3]) + _check_loc('ItemF', 'foo', self.panel.minor_axis[3]) + _check_loc('ItemE', 'foo', 'bar') + + def test_delitem_pop(self): + del self.panel['ItemB'] + assert_almost_equal(self.panel.items, ['ItemA', 'ItemC', 'ItemD']) + crackle = self.panel['ItemC'] + pop = self.panel.pop('ItemC') + self.assert_(pop is crackle) + assert_almost_equal(self.panel.items, ['ItemA', 'ItemD']) + + self.assertRaises(KeyError, self.panel.__delitem__, 'ItemC') + + def test_copy(self): + cop = self.panel.copy() + assert_sp_panel_equal(cop, self.panel) + + def test_reindex(self): + def _compare_with_dense(swp, items, major, minor): + swp_re = swp.reindex(items=items, major=major, + minor=minor) + dwp_re = swp.to_dense().reindex(items=items, major=major, + minor=minor) + assert_panel_equal(swp_re.to_dense(), dwp_re) + + _compare_with_dense(self.panel, self.panel.items[:2], + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + _compare_with_dense(self.panel, None, + self.panel.major_axis[::2], + self.panel.minor_axis[::2]) + + self.assertRaises(ValueError, self.panel.reindex) + + # TODO: do something about this later... 
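+        # reindexing to items missing from the panel currently raises (see TODO above)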
+ self.assertRaises(Exception, self.panel.reindex, + items=['item0', 'ItemA', 'ItemB']) + + # test copying + cp = self.panel.reindex(self.panel.major_axis, copy=True) + cp['ItemA']['E'] = cp['ItemA']['A'] + self.assert_('E' not in self.panel['ItemA']) + + def test_operators(self): + def _check_ops(panel): + def _dense_comp(op): + dense = panel.to_dense() + sparse_result = op(panel) + dense_result = op(dense) + assert_panel_equal(sparse_result.to_dense(), dense_result) + + def _mixed_comp(op): + result = op(panel, panel.to_dense()) + expected = op(panel.to_dense(), panel.to_dense()) + assert_panel_equal(result, expected) + + op1 = lambda x: x + 2 + + _dense_comp(op1) + op2 = lambda x: x.add(x.reindex(major=x.major_axis[::2])) + _dense_comp(op2) + op3 = lambda x: x.subtract(x.mean(0), axis=0) + _dense_comp(op3) + op4 = lambda x: x.subtract(x.mean(1), axis=1) + _dense_comp(op4) + op5 = lambda x: x.subtract(x.mean(2), axis=2) + _dense_comp(op5) + + _mixed_comp(Panel.multiply) + _mixed_comp(Panel.subtract) + + # TODO: this case not yet supported! + # op6 = lambda x: x.add(x.to_frame()) + # _dense_comp(op6) + + _check_ops(self.panel) + + def test_major_xs(self): + def _dense_comp(sparse): + dense = sparse.to_dense() + + for idx in sparse.major_axis: + dslice = dense.major_xs(idx) + sslice = sparse.major_xs(idx) + assert_frame_equal(dslice, sslice) + + _dense_comp(self.panel) + + def test_minor_xs(self): + def _dense_comp(sparse): + dense = sparse.to_dense() + + for idx in sparse.minor_axis: + dslice = dense.minor_xs(idx) + sslice = sparse.minor_xs(idx).to_dense() + assert_frame_equal(dslice, sslice) + + _dense_comp(self.panel) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + # nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure', + # '--with-profile'], + # exit=False) diff --git a/pandas/src/cppsandbox.pyx b/pandas/src/cppsandbox.pyx new file mode 100644 index 00000000..0210f6d5 --- /dev/null +++ b/pandas/src/cppsandbox.pyx @@ -0,0 +1,15 @@ +from libcpp.map cimport map +from numpy cimport ndarray, int32_t + +ctypedef int32_t i4 + +def map_indices(ndarray[i4] values): + cdef: + i4 i, n + map[i4, i4] mapping + + mapping = map[i4, i4]() + + n = len(values) + for i in range(n): + mapping[i] = values[i] diff --git a/pandas/src/data_algos.pyx b/pandas/src/data_algos.pyx new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/pandas/src/data_algos.pyx @@ -0,0 +1 @@ + diff --git a/pandas/src/datetime.pxd b/pandas/src/datetime.pxd new file mode 100644 index 00000000..2147c267 --- /dev/null +++ b/pandas/src/datetime.pxd @@ -0,0 +1,115 @@ +from numpy cimport int64_t, int32_t, npy_int64, npy_int32 +from cpython cimport PyObject + + +cdef extern from "stdint.h": + enum: INT64_MIN + enum: INT32_MIN + + + +cdef extern from "datetime.h": + + ctypedef class datetime.date [object PyDateTime_Date]: + pass + + ctypedef class datetime.datetime [object PyDateTime_DateTime]: + pass + + ctypedef class datetime.timedelta [object PyDateTime_Delta]: + pass + + void PyDateTime_IMPORT() + + int PyDateTime_GET_YEAR(date) + int PyDateTime_GET_MONTH(date) + int PyDateTime_GET_DAY(date) + int PyDateTime_DATE_GET_HOUR(object o) + int PyDateTime_DATE_GET_MINUTE(object o) + int PyDateTime_DATE_GET_SECOND(object o) + int PyDateTime_DATE_GET_MICROSECOND(object o) + int PyDateTime_TIME_GET_HOUR(object o) + int PyDateTime_TIME_GET_MINUTE(object o) + int PyDateTime_TIME_GET_SECOND(object o) + int 
PyDateTime_TIME_GET_MICROSECOND(object o) + bint PyDateTime_Check(object o) + bint PyDate_Check(object o) + bint PyTime_Check(object o) + object PyDateTime_FromDateAndTime(int year, int month, int day, int hour, + int minute, int second, int us) + +cdef extern from "datetime_helper.h": + void mangle_nat(object o) + +cdef extern from "numpy/ndarrayobject.h": + + ctypedef int64_t npy_timedelta + ctypedef int64_t npy_datetime + + ctypedef enum NPY_CASTING: + NPY_NO_CASTING + NPY_EQUIV_CASTING + NPY_SAFE_CASTING + NPY_SAME_KIND_CASTING + NPY_UNSAFE_CASTING + + +cdef extern from "numpy_helper.h": + npy_datetime get_datetime64_value(object o) + +cdef extern from "numpy/npy_common.h": + + ctypedef unsigned char npy_bool + +cdef extern from "datetime/np_datetime.h": + + ctypedef enum PANDAS_DATETIMEUNIT: + PANDAS_FR_Y + PANDAS_FR_M + PANDAS_FR_W + PANDAS_FR_D + PANDAS_FR_B + PANDAS_FR_h + PANDAS_FR_m + PANDAS_FR_s + PANDAS_FR_ms + PANDAS_FR_us + PANDAS_FR_ns + PANDAS_FR_ps + PANDAS_FR_fs + PANDAS_FR_as + + ctypedef struct pandas_datetimestruct: + npy_int64 year + npy_int32 month, day, hour, min, sec, us, ps, as + + int convert_pydatetime_to_datetimestruct(PyObject *obj, + pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo) + + npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d) + void pandas_datetime_to_datetimestruct(npy_datetime val, + PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) + int _days_per_month_table[2][12] + + int dayofweek(int y, int m, int d) + int is_leapyear(int64_t year) + PANDAS_DATETIMEUNIT get_datetime64_unit(object o) + +cdef extern from "datetime/np_datetime_strings.h": + + int parse_iso_8601_datetime(char *str, int len, PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, pandas_datetimestruct *out, + npy_bool *out_local, PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special) + + int make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) + + int get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) + + # int parse_python_string(object obj, pandas_datetimestruct *out) except -1 diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx new file mode 100644 index 00000000..0804479a --- /dev/null +++ b/pandas/src/datetime.pyx @@ -0,0 +1,1295 @@ +# cython: profile=False +cimport numpy as np +import numpy as np + +from numpy cimport int32_t, int64_t, import_array, ndarray +from cpython cimport * + +# this is our datetime.pxd +from datetime cimport * +from util cimport is_integer_object, is_datetime64_object + +from datetime import timedelta +from dateutil.parser import parse as parse_date +cimport util + +from khash cimport * +import cython + +# initialize numpy +import_array() +#import_ufunc() + +# import datetime C API +PyDateTime_IMPORT + +# in numpy 1.7, will prob need the following: +# numpy_pydatetime_import + +try: + basestring +except NameError: # py3 + basestring = str + +def ints_to_pydatetime(ndarray[int64_t] arr, tz=None): + cdef: + Py_ssize_t i, n = len(arr) + pandas_datetimestruct dts + ndarray[object] result = np.empty(n, dtype=object) + + if tz is not None: + if tz is pytz.utc: + for i in range(n): + pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, tz) + else: + trans = _get_transitions(tz) + deltas = _get_deltas(tz) + for i in range(n): + # Adjust datetime64 
timestamp, recompute datetimestruct + pos = trans.searchsorted(arr[i]) - 1 + inf = tz._transition_info[pos] + + pandas_datetime_to_datetimestruct(arr[i] + deltas[pos], + PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us, + tz._tzinfos[inf]) + else: + for i in range(n): + pandas_datetime_to_datetimestruct(arr[i], PANDAS_FR_ns, &dts) + result[i] = datetime(dts.year, dts.month, dts.day, dts.hour, + dts.min, dts.sec, dts.us) + + return result + + + +# Python front end to C extension type _Timestamp +# This serves as the box for datetime64 +class Timestamp(_Timestamp): + + def __new__(cls, object ts_input, object offset=None, tz=None): + cdef _TSObject ts + cdef _Timestamp ts_base + + if isinstance(ts_input, float): + # to do, do we want to support this, ie with fractional seconds? + raise TypeError("Cannot convert a float to datetime") + + if isinstance(ts_input, basestring): + try: + ts_input = parse_date(ts_input) + except Exception: + pass + + ts = convert_to_tsobject(ts_input, tz) + + if ts.value == NPY_NAT: + return NaT + + # make datetime happy + ts_base = _Timestamp.__new__(cls, ts.dts.year, ts.dts.month, + ts.dts.day, ts.dts.hour, ts.dts.min, + ts.dts.sec, ts.dts.us, ts.tzinfo) + + # fill out rest of data + ts_base.value = ts.value + ts_base.offset = offset + ts_base.nanosecond = ts.dts.ps / 1000 + + return ts_base + + def __repr__(self): + result = self._repr_base + + try: + result += self.strftime('%z') + if self.tzinfo: + result += self.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + except ValueError: + year2000 = self.replace(year=2000) + result += year2000.strftime('%z') + if self.tzinfo: + result += year2000.strftime(' %%Z, tz=%s' % self.tzinfo.zone) + + return '' % result + + @property + def _repr_base(self): + result = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (self.year, self.month, + self.day, self.hour, + self.minute, self.second) + + if self.nanosecond != 0: + nanos = self.nanosecond + 1000 * self.microsecond + result += '.%.9d' % nanos + elif self.microsecond != 0: + result += '.%.6d' % self.microsecond + + return result + + @property + def tz(self): + """ + Alias for tzinfo + """ + return self.tzinfo + + @property + def freq(self): + return self.offset + + def __setstate__(self, state): + self.value = state[0] + self.offset = state[1] + self.tzinfo = state[2] + + def __reduce__(self): + object_state = self.value, self.offset, self.tzinfo + return (Timestamp, object_state) + + def to_period(self, freq=None): + """ + Return an period of which this timestamp is an observation. 
+ """ + from pandas.tseries.period import Period + + if freq is None: + freq = self.freq + + return Period(self, freq=freq) + + @property + def dayofweek(self): + return self.weekday() + + @property + def dayofyear(self): + return self.day + + @property + def week(self): + return self._get_field('woy') + + weekofyear = week + + @property + def quarter(self): + return self._get_field('q') + + @property + def freqstr(self): + return getattr(self.offset, 'freqstr', self.offset) + + @property + def asm8(self): + return np.int64(self.value).view('M8[ns]') + + def tz_localize(self, tz): + """ + Convert naive Timestamp to local time zone + + Parameters + ---------- + tz : pytz.timezone + + Returns + ------- + localized : Timestamp + """ + if self.tzinfo is None: + # tz naive, localize + return Timestamp(self.to_pydatetime(), tz=tz) + else: + raise Exception('Cannot localize tz-aware Timestamp, use ' + 'tz_convert for conversions') + + def tz_convert(self, tz): + """ + Convert Timestamp to another time zone or localize to requested time + zone + + Parameters + ---------- + tz : pytz.timezone + + Returns + ------- + converted : Timestamp + """ + if self.tzinfo is None: + # tz naive, use tz_localize + raise Exception('Cannot convert tz-naive Timestamp, use ' + 'tz_localize to localize') + else: + # Same UTC timestamp, different time zone + return Timestamp(self.value, tz=tz) + + def replace(self, **kwds): + return Timestamp(datetime.replace(self, **kwds), + offset=self.offset) + + def to_pydatetime(self, warn=True): + """ + If warn=True, issue warning if nanoseconds is nonzero + """ + cdef: + pandas_datetimestruct dts + _TSObject ts + + if self.nanosecond != 0 and warn: + print 'Warning: discarding nonzero nanoseconds' + ts = convert_to_tsobject(self, self.tzinfo) + + return datetime(ts.dts.year, ts.dts.month, ts.dts.day, + ts.dts.hour, ts.dts.min, ts.dts.sec, + ts.dts.us, ts.tzinfo) + + +class NaTType(_NaT): + + def __new__(cls): + cdef _NaT base + + base = _NaT.__new__(cls, 1, 1, 1) + mangle_nat(base) + base.value = NPY_NAT + + return base + + def __repr__(self): + return 'NaT' + + def weekday(self): + return -1 + + def toordinal(self): + return -1 + +fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'microsecond', 'nanosecond', + 'week', 'dayofyear'] +for field in fields: + prop = property(fget=lambda self: -1) + setattr(NaTType, field, prop) + + +NaT = NaTType() + +iNaT = util.get_nat() + + +cdef inline bint is_timestamp(object o): + return isinstance(o, Timestamp) + +def is_timestamp_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_timestamp(values[i]): + return False + return True + + +cpdef object get_value_box(ndarray arr, object loc): + cdef: + Py_ssize_t i, sz + void* data_ptr + if util.is_float_object(loc): + casted = int(loc) + if casted == loc: + loc = casted + i = loc + sz = cnp.PyArray_SIZE(arr) + + if i < 0 and sz > 0: + i += sz + elif i >= sz or sz == 0: + raise IndexError('index out of bounds') + + if arr.descr.type_num == NPY_DATETIME: + return Timestamp(util.get_value_1d(arr, i)) + else: + return util.get_value_1d(arr, i) + + +#---------------------------------------------------------------------- +# Frequency inference + +def unique_deltas(ndarray[int64_t] arr): + cdef: + Py_ssize_t i, n = len(arr) + int64_t val + khiter_t k + kh_int64_t *table + int ret = 0 + list uniques = [] + + table = kh_init_int64() + kh_resize_int64(table, 10) + for i in range(n - 1): + val = arr[i + 1] - 
arr[i] + k = kh_get_int64(table, val) + if k == table.n_buckets: + kh_put_int64(table, val, &ret) + uniques.append(val) + kh_destroy_int64(table) + + result = np.array(uniques, dtype=np.int64) + result.sort() + return result + + +cdef inline bint _is_multiple(int64_t us, int64_t mult): + return us % mult == 0 + + +def apply_offset(ndarray[object] values, object offset): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] new_values + object boxed + + result = np.empty(n, dtype='M8[ns]') + new_values = result.view('i8') + pass + + +# This is PITA. Because we inherit from datetime, which has very specific +# construction requirements, we need to do object instantiation in python +# (see Timestamp class above). This will serve as a C extension type that +# shadows the python class, where we do any heavy lifting. +cdef class _Timestamp(datetime): + cdef readonly: + int64_t value, nanosecond + object offset # frequency reference + + def __hash__(self): + if self.nanosecond: + return hash(self.value) + else: + return datetime.__hash__(self) + + def __richcmp__(_Timestamp self, object other, int op): + cdef _Timestamp ots + + if isinstance(other, _Timestamp): + ots = other + elif isinstance(other, datetime): + ots = Timestamp(other) + else: + if op == 2: + return False + elif op == 3: + return True + else: + raise TypeError('Cannot compare Timestamp with %s' % str(other)) + + if self.tzinfo is None: + if other.tzinfo is not None: + raise Exception('Cannot compare tz-naive and tz-aware timestamps') + elif other.tzinfo is None: + raise Exception('Cannot compare tz-naive and tz-aware timestamps') + + if op == 2: # == + return self.value == ots.value + elif op == 3: # != + return self.value != ots.value + elif op == 0: # < + return self.value < ots.value + elif op == 1: # <= + return self.value <= ots.value + elif op == 4: # > + return self.value > ots.value + elif op == 5: # >= + return self.value >= ots.value + + def __add__(self, other): + if is_integer_object(other): + if self.offset is None: + msg = ("Cannot add integral value to Timestamp " + "without offset.") + raise ValueError(msg) + else: + return Timestamp((self.offset.__mul__(other)).apply(self)) + else: + if isinstance(other, timedelta) or hasattr(other, 'delta'): + nanos = _delta_to_nanoseconds(other) + return Timestamp(self.value + nanos, tz=self.tzinfo) + else: + result = datetime.__add__(self, other) + if isinstance(result, datetime): + result = Timestamp(result) + result.nanosecond = self.nanosecond + return result + + def __sub__(self, other): + if is_integer_object(other): + return self.__add__(-other) + else: + return datetime.__sub__(self, other) + + cpdef _get_field(self, field): + out = get_date_field(np.array([self.value], dtype=np.int64), field) + return out[0] + + +cdef class _NaT(_Timestamp): + + def __richcmp__(_NaT self, object other, int op): + # if not isinstance(other, (_NaT, _Timestamp)): + # raise TypeError('Cannot compare %s with NaT' % type(other)) + + if op == 2: # == + return False + elif op == 3: # != + return True + elif op == 0: # < + return False + elif op == 1: # <= + return False + elif op == 4: # > + return False + elif op == 5: # >= + return False + + + + +def _delta_to_nanoseconds(delta): + try: + delta = delta.delta + except: + pass + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 + + +# lightweight C object to hold datetime & int64 pair +cdef class _TSObject: + cdef: + pandas_datetimestruct dts # pandas_datetimestruct + int64_t value # 
numpy dt64 + object tzinfo + + property value: + def __get__(self): + return self.value + +# helper to extract datetime and int64 from several different possibilities +cpdef convert_to_tsobject(object ts, object tz=None): + """ + Extract datetime and int64 from any of: + - np.int64 + - np.datetime64 + - python int or long object + - iso8601 string object + - python datetime object + - another timestamp object + """ + cdef: + _TSObject obj + bint utc_convert = 1 + + if tz is not None: + if isinstance(tz, basestring): + tz = pytz.timezone(tz) + + obj = _TSObject() + + if is_datetime64_object(ts): + obj.value = _get_datetime64_nanos(ts) + pandas_datetime_to_datetimestruct(obj.value, PANDAS_FR_ns, &obj.dts) + elif is_integer_object(ts): + obj.value = ts + pandas_datetime_to_datetimestruct(ts, PANDAS_FR_ns, &obj.dts) + elif util.is_string_object(ts): + _string_to_dts(ts, &obj.dts) + obj.value = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &obj.dts) + elif PyDateTime_Check(ts): + if tz is not None: + # sort of a temporary hack + if ts.tzinfo is not None: + ts = tz.normalize(ts) + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + elif tz is not pytz.utc: + ts = tz.localize(ts) + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.value -= _delta_to_nanoseconds(ts.tzinfo._utcoffset) + obj.tzinfo = ts.tzinfo + else: + # UTC + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = tz + else: + obj.value = _pydatetime_to_dts(ts, &obj.dts) + obj.tzinfo = ts.tzinfo + if obj.tzinfo is not None: + obj.value -= _delta_to_nanoseconds(obj.tzinfo._utcoffset) + _check_dts_bounds(obj.value, &obj.dts) + return obj + elif PyDate_Check(ts): + obj.value = _date_to_datetime64(ts, &obj.dts) + else: + raise ValueError("Could not construct Timestamp from argument %s" % + type(ts)) + + if obj.value != NPY_NAT: + _check_dts_bounds(obj.value, &obj.dts) + + if tz is not None: + if tz is pytz.utc: + obj.tzinfo = tz + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans = _get_transitions(tz) + deltas = _get_deltas(tz) + pos = trans.searchsorted(obj.value, side='right') - 1 + inf = tz._transition_info[pos] + + pandas_datetime_to_datetimestruct(obj.value + deltas[pos], + PANDAS_FR_ns, &obj.dts) + obj.tzinfo = tz._tzinfos[inf] + + return obj + +cdef int64_t _NS_LOWER_BOUND = -9223285636854775809LL +cdef int64_t _NS_UPPER_BOUND = -9223372036854775807LL + +cdef inline _check_dts_bounds(int64_t value, pandas_datetimestruct *dts): + cdef pandas_datetimestruct dts2 + if dts.year <= 1677 or dts.year >= 2262: + pandas_datetime_to_datetimestruct(value, PANDAS_FR_ns, &dts2) + if dts2.year != dts.year: + fmt = '%d-%.2d-%.2d %.2d:%.2d:%.2d' % (dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec) + + raise ValueError('Out of bounds nanosecond timestamp: %s' % fmt) + +# elif isinstance(ts, _Timestamp): +# tmp = ts +# obj.value = (<_Timestamp> ts).value +# obj.dtval = +# elif isinstance(ts, object): +# # If all else fails +# obj.value = _dtlike_to_datetime64(ts, &obj.dts) +# obj.dtval = _dts_to_pydatetime(&obj.dts) + +cdef inline object _datetime64_to_datetime(int64_t val): + cdef pandas_datetimestruct dts + pandas_datetime_to_datetimestruct(val, PANDAS_FR_ns, &dts) + return _dts_to_pydatetime(&dts) + +cdef inline object _dts_to_pydatetime(pandas_datetimestruct *dts): + return PyDateTime_FromDateAndTime(dts.year, dts.month, + dts.day, dts.hour, + dts.min, dts.sec, dts.us) + +cdef inline int64_t _pydatetime_to_dts(object val, pandas_datetimestruct *dts): + dts.year = 
PyDateTime_GET_YEAR(val) + dts.month = PyDateTime_GET_MONTH(val) + dts.day = PyDateTime_GET_DAY(val) + dts.hour = PyDateTime_DATE_GET_HOUR(val) + dts.min = PyDateTime_DATE_GET_MINUTE(val) + dts.sec = PyDateTime_DATE_GET_SECOND(val) + dts.us = PyDateTime_DATE_GET_MICROSECOND(val) + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + +cdef inline int64_t _dtlike_to_datetime64(object val, + pandas_datetimestruct *dts): + dts.year = val.year + dts.month = val.month + dts.day = val.day + dts.hour = val.hour + dts.min = val.minute + dts.sec = val.second + dts.us = val.microsecond + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + +cdef inline int64_t _date_to_datetime64(object val, + pandas_datetimestruct *dts): + dts.year = PyDateTime_GET_YEAR(val) + dts.month = PyDateTime_GET_MONTH(val) + dts.day = PyDateTime_GET_DAY(val) + dts.hour = dts.min = dts.sec = dts.us = 0 + dts.ps = dts.as = 0 + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, dts) + + +cdef inline _string_to_dts(object val, pandas_datetimestruct* dts): + cdef: + npy_bool islocal, special + PANDAS_DATETIMEUNIT out_bestunit + int result + + if PyUnicode_Check(val): + val = PyUnicode_AsASCIIString(val); + result = parse_iso_8601_datetime(val, len(val), PANDAS_FR_ns, NPY_UNSAFE_CASTING, + dts, &islocal, &out_bestunit, &special) + if result == -1: + raise ValueError('Unable to parse %s' % str(val)) + +def array_to_datetime(ndarray[object] values, raise_=False, dayfirst=False): + cdef: + Py_ssize_t i, n = len(values) + object val + ndarray[int64_t] iresult + ndarray[object] oresult + pandas_datetimestruct dts + + from dateutil.parser import parse + + try: + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + for i in range(n): + val = values[i] + if util._checknull(val): + iresult[i] = iNaT + elif PyDateTime_Check(val): + iresult[i] = _pydatetime_to_dts(val, &dts) + _check_dts_bounds(iresult[i], &dts) + elif PyDate_Check(val): + iresult[i] = _date_to_datetime64(val, &dts) + _check_dts_bounds(iresult[i], &dts) + elif util.is_datetime64_object(val): + iresult[i] = _get_datetime64_nanos(val) + elif util.is_integer_object(val): + iresult[i] = val + else: + if len(val) == 0: + iresult[i] = iNaT + continue + try: + result[i] = parse(val, dayfirst=dayfirst) + except Exception: + raise TypeError + pandas_datetime_to_datetimestruct(iresult[i], PANDAS_FR_ns, + &dts) + _check_dts_bounds(iresult[i], &dts) + return result + except TypeError: + oresult = np.empty(n, dtype=object) + + for i in range(n): + val = values[i] + if util._checknull(val): + oresult[i] = val + else: + if len(val) == 0: + # TODO: ?? 
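+                    # empty values fall back to the string 'NaT' in the object-dtype result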
+ oresult[i] = 'NaT' + continue + try: + oresult[i] = parse(val, dayfirst=dayfirst) + except Exception: + if raise_: + raise + return values + # oresult[i] = val + + return oresult + +cdef inline _get_datetime64_nanos(object val): + cdef: + pandas_datetimestruct dts + PANDAS_DATETIMEUNIT unit + npy_datetime ival + + unit = get_datetime64_unit(val) + if unit == 3: + raise ValueError('NumPy 1.6.1 business freq not supported') + + ival = get_datetime64_value(val) + + if unit != PANDAS_FR_ns: + pandas_datetime_to_datetimestruct(ival, unit, &dts) + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + else: + return ival + + +def cast_to_nanoseconds(ndarray arr): + cdef: + Py_ssize_t i, n = arr.size + ndarray[int64_t] ivalues, iresult + PANDAS_DATETIMEUNIT unit + pandas_datetimestruct dts + + shape = ( arr).shape + + ivalues = arr.view(np.int64).ravel() + + result = np.empty(shape, dtype='M8[ns]') + iresult = result.ravel().view(np.int64) + + unit = get_datetime64_unit(arr.flat[0]) + if unit == 3: + raise ValueError('NumPy 1.6.1 business freq not supported') + + for i in range(n): + pandas_datetime_to_datetimestruct(ivalues[i], unit, &dts) + iresult[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + + return result + +#---------------------------------------------------------------------- +# Conversion routines + + +def pydt_to_i8(object pydt): + ''' + Convert to int64 representation compatible with numpy datetime64; converts + to UTC + ''' + cdef: + _TSObject ts + + ts = convert_to_tsobject(pydt) + + return ts.value + +def i8_to_pydt(int64_t i8, object tzinfo = None): + ''' + Inverse of pydt_to_i8 + ''' + return Timestamp(i8) + +#---------------------------------------------------------------------- +# time zone conversion helpers + +try: + import pytz + have_pytz = True +except: + have_pytz = False + +def tz_convert(ndarray[int64_t] vals, object tz1, object tz2): + cdef: + ndarray[int64_t] utc_dates, result, trans, deltas + Py_ssize_t i, pos, n = len(vals) + int64_t v, offset + + if not have_pytz: + import pytz + + # Convert to UTC + + if tz1.zone != 'UTC': + utc_dates = np.empty(n, dtype=np.int64) + deltas = _get_deltas(tz1) + trans = _get_transitions(tz1) + pos = trans.searchsorted(vals[0]) - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + + offset = deltas[pos] + for i in range(n): + v = vals[i] + if v >= trans[pos + 1]: + pos += 1 + offset = deltas[pos] + utc_dates[i] = v - offset + else: + utc_dates = vals + + if tz2.zone == 'UTC': + return utc_dates + + # Convert UTC to other timezone + + result = np.empty(n, dtype=np.int64) + trans = _get_transitions(tz2) + deltas = _get_deltas(tz2) + pos = trans.searchsorted(utc_dates[0]) - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + + offset = deltas[pos] + for i in range(n): + v = utc_dates[i] + if v >= trans[pos + 1]: + pos += 1 + offset = deltas[pos] + result[i] = v + offset + + return result + +def tz_convert_single(int64_t val, object tz1, object tz2): + cdef: + ndarray[int64_t] trans, deltas + Py_ssize_t pos + int64_t v, offset, utc_date + + + if not have_pytz: + import pytz + + # Convert to UTC + + if tz1.zone != 'UTC': + deltas = _get_deltas(tz1) + trans = _get_transitions(tz1) + pos = trans.searchsorted(val) - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + offset = deltas[pos] + utc_date = val - offset + else: + utc_date = val + + if tz2.zone == 'UTC': + return utc_date + + # Convert UTC to other timezone + trans = 
_get_transitions(tz2) + deltas = _get_deltas(tz2) + pos = trans.searchsorted(utc_date) - 1 + if pos < 0: + raise ValueError('First time before start of DST info') + + offset = deltas[pos] + return utc_date + offset + + +trans_cache = {} +utc_offset_cache = {} + +def _get_transitions(tz): + """ + Get UTC times of DST transitions + """ + if tz not in trans_cache: + arr = np.array(tz._utc_transition_times, dtype='M8[ns]') + trans_cache[tz] = arr.view('i8') + return trans_cache[tz] + +def _get_deltas(tz): + """ + Get UTC offsets in microseconds corresponding to DST transitions + """ + if tz not in utc_offset_cache: + utc_offset_cache[tz] = _unbox_utcoffsets(tz._transition_info) + return utc_offset_cache[tz] + +cdef double total_seconds(object td): # Python 2.6 compat + return ((td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) // + 10**6) + +cpdef ndarray _unbox_utcoffsets(object transinfo): + cdef: + Py_ssize_t i, sz + ndarray[int64_t] arr + + sz = len(transinfo) + arr = np.empty(sz, dtype='i8') + + for i in range(sz): + arr[i] = int(total_seconds(transinfo[i][0])) * 1000000000 + + return arr + + +def tz_localize_check(ndarray[int64_t] vals, object tz): + """ + Localize tzinfo-naive DateRange to given time zone (using pytz). If + there are ambiguities in the values, raise AmbiguousTimeError. + + Returns + ------- + localized : DatetimeIndex + """ + cdef: + ndarray[int64_t] trans, deltas + Py_ssize_t i, pos, n = len(vals) + int64_t v, dst_start, dst_end + + if not have_pytz: + raise Exception("Could not find pytz module") + + if tz == pytz.utc or tz is None: + return + + trans = _get_transitions(tz) + deltas = _get_deltas(tz) + + pos = np.searchsorted(trans, vals[0]) + dst_start = trans[pos] + deltas[pos - 1] + dst_end = trans[pos] + deltas[pos] + + for i in range(n): + v = vals[i] + if v >= trans[pos + 1]: + pos += 1 + dst_start = trans[pos] + deltas[pos - 1] + dst_end = trans[pos] + deltas[pos] + + if dst_start > dst_end: + dst_end, dst_start = dst_start, dst_end + + if dst_start <= v and v <= dst_end: + msg = "Cannot localize, ambiguous time %s found" % Timestamp(v) + raise pytz.AmbiguousTimeError(msg) + +@cython.boundscheck(False) +@cython.wraparound(False) +def tz_localize_to_utc(ndarray[int64_t] vals, object tz): + """ + Localize tzinfo-naive DateRange to given time zone (using pytz). If + there are ambiguities in the values, raise AmbiguousTimeError. 
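+    Values that fall in a non-existent local time (skipped by a DST transition)
+    raise NonExistentTimeError.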
+ + Returns + ------- + localized : DatetimeIndex + """ + cdef: + ndarray[int64_t] trans, deltas, idx_shifted + Py_ssize_t i, idx, pos, ntrans, n = len(vals) + int64_t *tdata + int64_t v, left, right + ndarray[int64_t] result, result_a, result_b + + # Vectorized version of DstTzInfo.localize + + if not have_pytz: + raise Exception("Could not find pytz module") + + if tz == pytz.utc or tz is None: + return vals + + trans = _get_transitions(tz) # transition dates + deltas = _get_deltas(tz) # utc offsets + + tdata = trans.data + ntrans = len(trans) + + result = np.empty(n, dtype=np.int64) + result_a = np.empty(n, dtype=np.int64) + result_b = np.empty(n, dtype=np.int64) + result_a.fill(NPY_NAT) + result_b.fill(NPY_NAT) + + # left side + idx_shifted = _ensure_int64( + np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1)) + + for i in range(n): + v = vals[i] - deltas[idx_shifted[i]] + pos = bisect_right_i8(tdata, v, ntrans) - 1 + + # timestamp falls to the left side of the DST transition + if v + deltas[pos] == vals[i]: + result_a[i] = v + + # right side + idx_shifted = _ensure_int64( + np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1)) + + for i in range(n): + v = vals[i] - deltas[idx_shifted[i]] + pos = bisect_right_i8(tdata, v, ntrans) - 1 + + # timestamp falls to the right side of the DST transition + if v + deltas[pos] == vals[i]: + result_b[i] = v + + for i in range(n): + left = result_a[i] + right = result_b[i] + if left != NPY_NAT and right != NPY_NAT: + if left == right: + result[i] = left + else: + stamp = Timestamp(vals[i]) + raise pytz.AmbiguousTimeError(stamp) + elif left != NPY_NAT: + result[i] = left + elif right != NPY_NAT: + result[i] = right + else: + stamp = Timestamp(vals[i]) + raise pytz.NonExistentTimeError(stamp) + + return result + +cdef _ensure_int64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT64: + return arr + else: + return arr.astype(np.int64) + else: + return np.array(arr, dtype=np.int64) + + +cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n): + cdef Py_ssize_t pivot, left = 0, right = n + + # edge cases + if val > data[n - 1]: + return n + + if val < data[0]: + return 0 + + while left < right: + pivot = left + (right - left) // 2 + + if data[pivot] <= val: + left = pivot + 1 + else: + right = pivot + + return left + + +# Accessors +#---------------------------------------------------------------------- + +def build_field_sarray(ndarray[int64_t] dtindex): + ''' + Datetime as int64 representation to a structured array of fields + ''' + cdef: + Py_ssize_t i, count = 0 + int isleap + pandas_datetimestruct dts + ndarray[int32_t] years, months, days, hours, minutes, seconds, mus + + count = len(dtindex) + + sa_dtype = [('Y', 'i4'), # year + ('M', 'i4'), # month + ('D', 'i4'), # day + ('h', 'i4'), # hour + ('m', 'i4'), # min + ('s', 'i4'), # second + ('u', 'i4')] # microsecond + + out = np.empty(count, dtype=sa_dtype) + + years = out['Y'] + months = out['M'] + days = out['D'] + hours = out['h'] + minutes = out['m'] + seconds = out['s'] + mus = out['u'] + + for i in range(count): + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + years[i] = dts.year + months[i] = dts.month + days[i] = dts.day + hours[i] = dts.hour + minutes[i] = dts.min + seconds[i] = dts.sec + mus[i] = dts.us + + return out + +def get_time_micros(ndarray[int64_t] dtindex): + ''' + Datetime as int64 representation to a structured array of fields + ''' + cdef: + Py_ssize_t i, n = len(dtindex) + 
pandas_datetimestruct dts + ndarray[int64_t] micros + + micros = np.empty(n, dtype=np.int64) + + for i in range(n): + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + micros[i] = 1000000LL * (dts.hour * 60 * 60 + + 60 * dts.min + dts.sec) + dts.us + + return micros + +@cython.wraparound(False) +def get_date_field(ndarray[int64_t] dtindex, object field): + ''' + Given a int64-based datetime index, extract the year, month, etc., + field and return an array of these values. + ''' + cdef: + _TSObject ts + Py_ssize_t i, count = 0 + ndarray[int32_t] out + ndarray[int32_t, ndim=2] _month_offset + int isleap + pandas_datetimestruct dts + + _month_offset = np.array( + [[ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 ], + [ 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 ]], + dtype=np.int32 ) + + count = len(dtindex) + out = np.empty(count, dtype='i4') + + if field == 'Y': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.year + return out + + elif field == 'M': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.month + return out + + elif field == 'D': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.day + return out + + elif field == 'h': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.hour + return out + + elif field == 'm': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.min + return out + + elif field == 's': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.sec + return out + + elif field == 'us': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.us + return out + elif field == 'ns': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.ps / 1000 + return out + elif field == 'doy': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + isleap = is_leapyear(dts.year) + out[i] = _month_offset[isleap, dts.month-1] + dts.day + return out + + elif field == 'dow': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + ts = convert_to_tsobject(dtindex[i]) + out[i] = ts_dayofweek(ts) + return out + + elif field == 'woy': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + isleap = is_leapyear(dts.year) + out[i] = _month_offset[isleap, dts.month - 1] + dts.day + out[i] = ((out[i] - 1) / 7) + 1 + return out + + elif field == 'q': + for i in range(count): + if dtindex[i] == NPY_NAT: out[i] = -1; continue + + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + out[i] = dts.month + out[i] = ((out[i] - 1) / 3) + 1 + return out + + raise ValueError("Field %s not supported" % 
field) + + +cdef inline int m8_weekday(int64_t val): + ts = convert_to_tsobject(val) + return ts_dayofweek(ts) + +cdef int64_t DAY_NS = 86400000000000LL + + +def date_normalize(ndarray[int64_t] stamps): + cdef: + Py_ssize_t i, n = len(stamps) + ndarray[int64_t] result = np.empty(n, dtype=np.int64) + pandas_datetimestruct dts + + for i in range(n): + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + dts.hour = 0 + dts.min = 0 + dts.sec = 0 + dts.us = 0 + result[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + + return result + +def dates_normalized(ndarray[int64_t] stamps): + cdef: + Py_ssize_t i, n = len(stamps) + pandas_datetimestruct dts + + for i in range(n): + pandas_datetime_to_datetimestruct(stamps[i], PANDAS_FR_ns, &dts) + if (dts.hour + dts.min + dts.sec + dts.us) > 0: + return False + + return True + +# Some general helper functions +#---------------------------------------------------------------------- + +def isleapyear(int64_t year): + return is_leapyear(year) + +def monthrange(int64_t year, int64_t month): + cdef: + int64_t days + int64_t day_of_week + + if month < 1 or month > 12: + raise ValueError("bad month number 0; must be 1-12") + + days = _days_per_month_table[is_leapyear(year)][month-1] + + return (dayofweek(year, month, 1), days) + +cdef inline int64_t ts_dayofweek(_TSObject ts): + return dayofweek(ts.dts.year, ts.dts.month, ts.dts.day) + diff --git a/pandas/src/datetime/np_datetime.c b/pandas/src/datetime/np_datetime.c new file mode 100644 index 00000000..ad2ffacb --- /dev/null +++ b/pandas/src/datetime/np_datetime.c @@ -0,0 +1,949 @@ +/* + * This is derived from Numpy 1.7 + * + * See NP_LICENSE.txt + */ + +#define NO_IMPORT + +#include +#include + +/* #define __MSVCRT_VERSION__ 0x0700 /\* whatever above 0x0601 *\/ */ +/* #include */ +/* #define time_t __time64_t */ +/* #define localtime _localtime64 */ +/* #define time _time64 */ + +#include +#include +#include "np_datetime.h" + +#if PY_MAJOR_VERSION >= 3 + #define PyIntObject PyLongObject + #define PyInt_Type PyLong_Type + #define PyInt_Check(op) PyLong_Check(op) + #define PyInt_CheckExact(op) PyLong_CheckExact(op) + #define PyInt_FromString PyLong_FromString + #define PyInt_FromUnicode PyLong_FromUnicode + #define PyInt_FromLong PyLong_FromLong + #define PyInt_FromSize_t PyLong_FromSize_t + #define PyInt_FromSsize_t PyLong_FromSsize_t + #define PyInt_AsLong PyLong_AsLong + #define PyInt_AS_LONG PyLong_AS_LONG + #define PyInt_AsSsize_t PyLong_AsSsize_t + #define PyInt_AsUnsignedLongMask PyLong_AsUnsignedLongMask + #define PyInt_AsUnsignedLongLongMask PyLong_AsUnsignedLongLongMask +#endif + +/* + * Returns 1 if the given year is a leap year, 0 otherwise. + */ +int is_leapyear(npy_int64 year) +{ + return (year & 0x3) == 0 && /* year % 4 == 0 */ + ((year % 100) != 0 || + (year % 400) == 0); +} + +/* + * Sakamoto's method, from wikipedia + */ +int dayofweek(int y, int m, int d) +{ + int day; + static int t[] = {0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4}; + y -= m < 3; + day = (y + y/4 - y/100 + y/400 + t[m-1] + d) % 7; + // convert to python day + return (day + 6) % 7; +} + +/* + * Adjusts a datetimestruct based on a minutes offset. 
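+ * For example, applying -90 minutes to 2012-01-01 00:30 rolls back
+ * through the minute, hour and day adjustments below to 2011-12-31 23:00.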
Assumes + * the current values are valid.g + */ +void +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes) +{ + int isleap; + + /* MINUTES */ + dts->min += minutes; + while (dts->min < 0) { + dts->min += 60; + dts->hour--; + } + while (dts->min >= 60) { + dts->min -= 60; + dts->hour++; + } + + /* HOURS */ + while (dts->hour < 0) { + dts->hour += 24; + dts->day--; + } + while (dts->hour >= 24) { + dts->hour -= 24; + dts->day++; + } + + /* DAYS */ + if (dts->day < 1) { + dts->month--; + if (dts->month < 1) { + dts->year--; + dts->month = 12; + } + isleap = is_leapyear(dts->year); + dts->day += _days_per_month_table[isleap][dts->month-1]; + } + else if (dts->day > 28) { + isleap = is_leapyear(dts->year); + if (dts->day > _days_per_month_table[isleap][dts->month-1]) { + dts->day -= _days_per_month_table[isleap][dts->month-1]; + dts->month++; + if (dts->month > 12) { + dts->year++; + dts->month = 1; + } + } + } +} + +/* + * Calculates the days offset from the 1970 epoch. + */ +npy_int64 +get_datetimestruct_days(const pandas_datetimestruct *dts) +{ + int i, month; + npy_int64 year, days = 0; + int *month_lengths; + + year = dts->year - 1970; + days = year * 365; + + /* Adjust for leap years */ + if (days >= 0) { + /* + * 1968 is the closest leap year before 1970. + * Exclude the current year, so add 1. + */ + year += 1; + /* Add one day for each 4 years */ + days += year / 4; + /* 1900 is the closest previous year divisible by 100 */ + year += 68; + /* Subtract one day for each 100 years */ + days -= year / 100; + /* 1600 is the closest previous year divisible by 400 */ + year += 300; + /* Add one day for each 400 years */ + days += year / 400; + } + else { + /* + * 1972 is the closest later year after 1970. + * Include the current year, so subtract 2. + */ + year -= 2; + /* Subtract one day for each 4 years */ + days += year / 4; + /* 2000 is the closest later year divisible by 100 */ + year -= 28; + /* Add one day for each 100 years */ + days -= year / 100; + /* 2000 is also the closest later year divisible by 400 */ + /* Subtract one day for each 400 years */ + days += year / 400; + } + + month_lengths = _days_per_month_table[is_leapyear(dts->year)]; + month = dts->month - 1; + + /* Add the months */ + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + /* Add the days */ + days += dts->day - 1; + + return days; +} + +/* + * Modifies '*days_' to be the day offset within the year, + * and returns the year. + */ +static npy_int64 +days_to_yearsdays(npy_int64 *days_) +{ + const npy_int64 days_per_400years = (400*365 + 100 - 4 + 1); + /* Adjust so it's relative to the year 2000 (divisible by 400) */ + npy_int64 days = (*days_) - (365*30 + 7); + npy_int64 year; + + /* Break down the 400 year cycle to get the year and day within the year */ + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } + else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + /* Work out the year/day within the 400 year cycle */ + if (days >= 366) { + year += 100 * ((days-1) / (100*365 + 25 - 1)); + days = (days-1) % (100*365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days+1) / (4*365 + 1)); + days = (days+1) % (4*365 + 1); + if (days >= 366) { + year += (days-1) / 365; + days = (days-1) % 365; + } + } + } + + *days_ = days; + return year + 2000; +} + +/* + * Adjusts a datetimestruct based on a seconds offset. 
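+ * The overflow or underflow is folded into whole minutes and handed to
+ * add_minutes_to_datetimestruct, leaving 0 <= sec < 60.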
Assumes + * the current values are valid. + */ +NPY_NO_EXPORT void +add_seconds_to_datetimestruct(pandas_datetimestruct *dts, int seconds) +{ + int minutes; + + dts->sec += seconds; + if (dts->sec < 0) { + minutes = dts->sec / 60; + dts->sec = dts->sec % 60; + if (dts->sec < 0) { + --minutes; + dts->sec += 60; + } + add_minutes_to_datetimestruct(dts, minutes); + } + else if (dts->sec >= 60) { + minutes = dts->sec / 60; + dts->sec = dts->sec % 60; + add_minutes_to_datetimestruct(dts, minutes); + } +} + +/* + * Fills in the year, month, day in 'dts' based on the days + * offset from 1970. + */ +static void +set_datetimestruct_days(npy_int64 days, pandas_datetimestruct *dts) +{ + int *month_lengths, i; + + dts->year = days_to_yearsdays(&days); + month_lengths = _days_per_month_table[is_leapyear(dts->year)]; + + for (i = 0; i < 12; ++i) { + if (days < month_lengths[i]) { + dts->month = i + 1; + dts->day = days + 1; + return; + } + else { + days -= month_lengths[i]; + } + } +} + +/* + * + * Tests for and converts a Python datetime.datetime or datetime.date + * object into a NumPy pandas_datetimestruct. + * + * While the C API has PyDate_* and PyDateTime_* functions, the following + * implementation just asks for attributes, and thus supports + * datetime duck typing. The tzinfo time zone conversion would require + * this style of access anyway. + * + * 'out_bestunit' gives a suggested unit based on whether the object + * was a datetime.date or datetime.datetime object. + * + * If 'apply_tzinfo' is 1, this function uses the tzinfo to convert + * to UTC time, otherwise it returns the struct with the local time. + * + * Returns -1 on error, 0 on success, and 1 (with no error set) + * if obj doesn't have the neeeded date or datetime attributes. + */ +int +convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo) +{ + PyObject *tmp; + int isleap; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->month = 1; + out->day = 1; + + /* Need at least year/month/day attributes */ + if (!PyObject_HasAttrString(obj, "year") || + !PyObject_HasAttrString(obj, "month") || + !PyObject_HasAttrString(obj, "day")) { + return 1; + } + + /* Get the year */ + tmp = PyObject_GetAttrString(obj, "year"); + if (tmp == NULL) { + return -1; + } + out->year = PyInt_AsLong(tmp); + if (out->year == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the month */ + tmp = PyObject_GetAttrString(obj, "month"); + if (tmp == NULL) { + return -1; + } + out->month = PyInt_AsLong(tmp); + if (out->month == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the day */ + tmp = PyObject_GetAttrString(obj, "day"); + if (tmp == NULL) { + return -1; + } + out->day = PyInt_AsLong(tmp); + if (out->day == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Validate that the month and day are valid for the year */ + if (out->month < 1 || out->month > 12) { + goto invalid_date; + } + isleap = is_leapyear(out->year); + if (out->day < 1 || + out->day > _days_per_month_table[isleap][out->month-1]) { + goto invalid_date; + } + + /* Check for time attributes (if not there, return success as a date) */ + if (!PyObject_HasAttrString(obj, "hour") || + !PyObject_HasAttrString(obj, "minute") || + !PyObject_HasAttrString(obj, "second") || + !PyObject_HasAttrString(obj, "microsecond")) { + /* The best unit for date is 'D' 
*/ + if (out_bestunit != NULL) { + *out_bestunit = PANDAS_FR_D; + } + return 0; + } + + /* Get the hour */ + tmp = PyObject_GetAttrString(obj, "hour"); + if (tmp == NULL) { + return -1; + } + out->hour = PyInt_AsLong(tmp); + if (out->hour == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the minute */ + tmp = PyObject_GetAttrString(obj, "minute"); + if (tmp == NULL) { + return -1; + } + out->min = PyInt_AsLong(tmp); + if (out->min == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the second */ + tmp = PyObject_GetAttrString(obj, "second"); + if (tmp == NULL) { + return -1; + } + out->sec = PyInt_AsLong(tmp); + if (out->sec == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Get the microsecond */ + tmp = PyObject_GetAttrString(obj, "microsecond"); + if (tmp == NULL) { + return -1; + } + out->us = PyInt_AsLong(tmp); + if (out->us == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + if (out->hour < 0 || out->hour >= 24 || + out->min < 0 || out->min >= 60 || + out->sec < 0 || out->sec >= 60 || + out->us < 0 || out->us >= 1000000) { + goto invalid_time; + } + + /* Apply the time zone offset if it exists */ + if (apply_tzinfo && PyObject_HasAttrString(obj, "tzinfo")) { + tmp = PyObject_GetAttrString(obj, "tzinfo"); + if (tmp == NULL) { + return -1; + } + if (tmp == Py_None) { + Py_DECREF(tmp); + } + else { + PyObject *offset; + int seconds_offset, minutes_offset; + + /* The utcoffset function should return a timedelta */ + offset = PyObject_CallMethod(tmp, "utcoffset", "O", obj); + if (offset == NULL) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* + * The timedelta should have a function "total_seconds" + * which contains the value we want. + */ + tmp = PyObject_CallMethod(offset, "total_seconds", ""); + if (tmp == NULL) { + return -1; + } + seconds_offset = PyInt_AsLong(tmp); + if (seconds_offset == -1 && PyErr_Occurred()) { + Py_DECREF(tmp); + return -1; + } + Py_DECREF(tmp); + + /* Convert to a minutes offset and apply it */ + minutes_offset = seconds_offset / 60; + + add_minutes_to_datetimestruct(out, -minutes_offset); + } + } + + /* The resolution of Python's datetime is 'us' */ + if (out_bestunit != NULL) { + *out_bestunit = PANDAS_FR_us; + } + + return 0; + +invalid_date: + PyErr_Format(PyExc_ValueError, + "Invalid date (%d,%d,%d) when converting to NumPy datetime", + (int)out->year, (int)out->month, (int)out->day); + return -1; + +invalid_time: + PyErr_Format(PyExc_ValueError, + "Invalid time (%d,%d,%d,%d) when converting " + "to NumPy datetime", + (int)out->hour, (int)out->min, (int)out->sec, (int)out->us); + return -1; +} + +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *d) +{ + pandas_datetime_metadata meta; + npy_datetime result = PANDAS_DATETIME_NAT; + + meta.base = fr; + meta.num = 1; + + convert_datetimestruct_to_datetime(&meta, d, &result); + return result; +} + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result) +{ + pandas_datetime_metadata meta; + + meta.base = fr; + meta.num = 1; + + convert_datetime_to_datetimestruct(&meta, val, result); +} + +PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { + return ((PyDatetimeScalarObject *) obj)->obmeta.base; +} + + +/* + * Converts a datetime from a datetimestruct to a datetime based + * on some metadata. The date is assumed to be valid. 
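+ * For instance, with base PANDAS_FR_D the struct for 2012-06-29 converts
+ * to 15520 (days since 1970-01-01); with base PANDAS_FR_s midnight of
+ * that day becomes 15520 * 86400 = 1340928000.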
+ * + * TODO: If meta->num is really big, there could be overflow + * + * Returns 0 on success, -1 on failure. + */ +int +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, + npy_datetime *out) +{ + npy_datetime ret; + PANDAS_DATETIMEUNIT base = meta->base; + + if (base == PANDAS_FR_Y) { + /* Truncate to the year */ + ret = dts->year - 1970; + } + else if (base == PANDAS_FR_M) { + /* Truncate to the month */ + ret = 12 * (dts->year - 1970) + (dts->month - 1); + } + else { + /* Otherwise calculate the number of days to start */ + npy_int64 days = get_datetimestruct_days(dts); + + switch (base) { + case PANDAS_FR_W: + /* Truncate to weeks */ + if (days >= 0) { + ret = days / 7; + } + else { + ret = (days - 6) / 7; + } + break; + case PANDAS_FR_D: + ret = days; + break; + case PANDAS_FR_h: + ret = days * 24 + + dts->hour; + break; + case PANDAS_FR_m: + ret = (days * 24 + + dts->hour) * 60 + + dts->min; + break; + case PANDAS_FR_s: + ret = ((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec; + break; + case PANDAS_FR_ms: + ret = (((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000 + + dts->us / 1000; + break; + case PANDAS_FR_us: + ret = (((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us; + break; + case PANDAS_FR_ns: + ret = ((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000 + + dts->ps / 1000; + break; + case PANDAS_FR_ps: + ret = ((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps; + break; + case PANDAS_FR_fs: + /* only 2.6 hours */ + ret = (((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps) * 1000 + + dts->as / 1000; + break; + case PANDAS_FR_as: + /* only 9.2 secs */ + ret = (((((days * 24 + + dts->hour) * 60 + + dts->min) * 60 + + dts->sec) * 1000000 + + dts->us) * 1000000 + + dts->ps) * 1000000 + + dts->as; + break; + default: + /* Something got corrupted */ + PyErr_SetString(PyExc_ValueError, + "NumPy datetime metadata with corrupt unit value"); + return -1; + } + } + + /* Divide by the multiplier */ + if (meta->num > 1) { + if (ret >= 0) { + ret /= meta->num; + } + else { + ret = (ret - meta->num + 1) / meta->num; + } + } + + *out = ret; + + return 0; +} + + +/* + * This provides the casting rules for the TIMEDELTA data type units. + * + * Notably, there is a barrier between the nonlinear years and + * months units, and all the other units. + */ +npy_bool +can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) +{ + switch (casting) { + /* Allow anything with unsafe casting */ + case NPY_UNSAFE_CASTING: + return 1; + + /* + * Only enforce the 'date units' vs 'time units' barrier with + * 'same_kind' casting. + */ + case NPY_SAME_KIND_CASTING: + return (src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M); + + /* + * Enforce the 'date units' vs 'time units' barrier and that + * casting is only allowed towards more precise units with + * 'safe' casting. 
+ */ + case NPY_SAFE_CASTING: + return (src_unit <= dst_unit) && + ((src_unit <= PANDAS_FR_M && dst_unit <= PANDAS_FR_M) || + (src_unit > PANDAS_FR_M && dst_unit > PANDAS_FR_M)); + + /* Enforce equality with 'no' or 'equiv' casting */ + default: + return src_unit == dst_unit; + } +} + +/* + * This provides the casting rules for the DATETIME data type units. + * + * Notably, there is a barrier between 'date units' and 'time units' + * for all but 'unsafe' casting. + */ +npy_bool +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting) +{ + switch (casting) { + /* Allow anything with unsafe casting */ + case NPY_UNSAFE_CASTING: + return 1; + + /* + * Only enforce the 'date units' vs 'time units' barrier with + * 'same_kind' casting. + */ + case NPY_SAME_KIND_CASTING: + return (src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D); + + /* + * Enforce the 'date units' vs 'time units' barrier and that + * casting is only allowed towards more precise units with + * 'safe' casting. + */ + case NPY_SAFE_CASTING: + return (src_unit <= dst_unit) && + ((src_unit <= PANDAS_FR_D && dst_unit <= PANDAS_FR_D) || + (src_unit > PANDAS_FR_D && dst_unit > PANDAS_FR_D)); + + /* Enforce equality with 'no' or 'equiv' casting */ + default: + return src_unit == dst_unit; + } +} + +/* + * Converts a datetime based on the given metadata into a datetimestruct + */ +int +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out) +{ + npy_int64 perday; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->year = 1970; + out->month = 1; + out->day = 1; + + /* TODO: Change to a mechanism that avoids the potential overflow */ + dt *= meta->num; + + /* + * Note that care must be taken with the / and % operators + * for negative values. 
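+     * For example, with base PANDAS_FR_M a value of -1 must decode to
+     * 1969-12 rather than to a negative month offset within 1970.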
+ */ + switch (meta->base) { + case PANDAS_FR_Y: + out->year = 1970 + dt; + break; + + case PANDAS_FR_M: + if (dt >= 0) { + out->year = 1970 + dt / 12; + out->month = dt % 12 + 1; + } + else { + out->year = 1969 + (dt + 1) / 12; + out->month = 12 + (dt + 1)% 12; + } + break; + + case PANDAS_FR_W: + /* A week is 7 days */ + set_datetimestruct_days(dt * 7, out); + break; + + case PANDAS_FR_D: + set_datetimestruct_days(dt, out); + break; + + case PANDAS_FR_h: + perday = 24LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt; + break; + + case PANDAS_FR_m: + perday = 24LL * 60; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / 60; + out->min = dt % 60; + break; + + case PANDAS_FR_s: + perday = 24LL * 60 * 60; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60); + out->min = (dt / 60) % 60; + out->sec = dt % 60; + break; + + case PANDAS_FR_ms: + perday = 24LL * 60 * 60 * 1000; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000LL); + out->min = (dt / (60*1000LL)) % 60; + out->sec = (dt / 1000LL) % 60; + out->us = (dt % 1000LL) * 1000; + break; + + case PANDAS_FR_us: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000LL); + out->min = (dt / (60*1000000LL)) % 60; + out->sec = (dt / 1000000LL) % 60; + out->us = dt % 1000000LL; + break; + + case PANDAS_FR_ns: + perday = 24LL * 60LL * 60LL * 1000LL * 1000LL * 1000LL; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000000LL); + out->min = (dt / (60*1000000000LL)) % 60; + out->sec = (dt / 1000000000LL) % 60; + out->us = (dt / 1000LL) % 1000000LL; + out->ps = (dt % 1000LL) * 1000; + break; + + case PANDAS_FR_ps: + perday = 24LL * 60 * 60 * 1000 * 1000 * 1000 * 1000; + + if (dt >= 0) { + set_datetimestruct_days(dt / perday, out); + dt = dt % perday; + } + else { + set_datetimestruct_days((dt - (perday-1)) / perday, out); + dt = (perday-1) + (dt + 1) % perday; + } + out->hour = dt / (60*60*1000000000000LL); + out->min = (dt / (60*1000000000000LL)) % 60; + out->sec = (dt / 1000000000000LL) % 60; + out->us = (dt / 1000000LL) % 1000000LL; + out->ps = dt % 1000000LL; + break; + + case PANDAS_FR_fs: + /* entire range is only +- 2.6 hours */ + if (dt >= 0) { + out->hour = dt / (60*60*1000000000000000LL); + out->min = (dt / (60*1000000000000000LL)) % 60; + out->sec = (dt / 1000000000000000LL) % 60; + out->us = (dt / 1000000000LL) % 1000000LL; + out->ps = (dt / 1000LL) % 1000000LL; + out->as = (dt % 1000LL) * 1000; + } + else { + npy_datetime minutes; + + minutes = dt / 
(60*1000000000000000LL); + dt = dt % (60*1000000000000000LL); + if (dt < 0) { + dt += (60*1000000000000000LL); + --minutes; + } + /* Offset the negative minutes */ + add_minutes_to_datetimestruct(out, minutes); + out->sec = (dt / 1000000000000000LL) % 60; + out->us = (dt / 1000000000LL) % 1000000LL; + out->ps = (dt / 1000LL) % 1000000LL; + out->as = (dt % 1000LL) * 1000; + } + break; + + case PANDAS_FR_as: + /* entire range is only +- 9.2 seconds */ + if (dt >= 0) { + out->sec = (dt / 1000000000000000000LL) % 60; + out->us = (dt / 1000000000000LL) % 1000000LL; + out->ps = (dt / 1000000LL) % 1000000LL; + out->as = dt % 1000000LL; + } + else { + npy_datetime seconds; + + seconds = dt / 1000000000000000000LL; + dt = dt % 1000000000000000000LL; + if (dt < 0) { + dt += 1000000000000000000LL; + --seconds; + } + /* Offset the negative seconds */ + add_seconds_to_datetimestruct(out, seconds); + out->us = (dt / 1000000000000LL) % 1000000LL; + out->ps = (dt / 1000000LL) % 1000000LL; + out->as = dt % 1000000LL; + } + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy datetime metadata is corrupted with invalid " + "base unit"); + return -1; + } + + return 0; +} + diff --git a/pandas/src/datetime/np_datetime.h b/pandas/src/datetime/np_datetime.h new file mode 100644 index 00000000..281c4a56 --- /dev/null +++ b/pandas/src/datetime/np_datetime.h @@ -0,0 +1,122 @@ +/* + * This is derived from numpy 1.7 + * See NP_LICENSE.TXT + */ + +#ifndef _PANDAS_DATETIME_H_ +#define _PANDAS_DATETIME_H_ + +#include + +typedef enum { + PANDAS_FR_Y = 0, /* Years */ + PANDAS_FR_M = 1, /* Months */ + PANDAS_FR_W = 2, /* Weeks */ + /* Gap where NPY_FR_B was */ + PANDAS_FR_D = 4, /* Days */ + PANDAS_FR_h = 5, /* hours */ + PANDAS_FR_m = 6, /* minutes */ + PANDAS_FR_s = 7, /* seconds */ + PANDAS_FR_ms = 8,/* milliseconds */ + PANDAS_FR_us = 9,/* microseconds */ + PANDAS_FR_ns = 10,/* nanoseconds */ + PANDAS_FR_ps = 11,/* picoseconds */ + PANDAS_FR_fs = 12,/* femtoseconds */ + PANDAS_FR_as = 13,/* attoseconds */ + PANDAS_FR_GENERIC = 14 /* Generic, unbound units, can convert to anything */ +} PANDAS_DATETIMEUNIT; + +#define PANDAS_DATETIME_NUMUNITS 13 + +#define PANDAS_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) + +#define PANDAS_DATETIME_NAT NPY_MIN_INT64 + +typedef struct { + npy_int64 year; + npy_int32 month, day, hour, min, sec, us, ps, as; +} pandas_datetimestruct; + +typedef struct { + PANDAS_DATETIMEUNIT base; + int num; +} pandas_datetime_metadata; + +// stuff pandas needs +// ---------------------------------------------------------------------------- + +int convert_pydatetime_to_datetimestruct(PyObject *obj, pandas_datetimestruct *out, + PANDAS_DATETIMEUNIT *out_bestunit, + int apply_tzinfo); + +npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *d); + +void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, + pandas_datetimestruct *result); + +int dayofweek(int y, int m, int d); + +static int _days_per_month_table[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +// stuff numpy-derived code needs in header +// ---------------------------------------------------------------------------- + +int is_leapyear(npy_int64 year); + +/* + * Converts a datetime from a datetimestruct to a datetime based + * on some metadata. The date is assumed to be valid. + * + * TODO: If meta->num is really big, there could be overflow + * + * Returns 0 on success, -1 on failure. 
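+ * A typical round trip (sketch): fill a pandas_datetimestruct, call
+ * pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) for the int64
+ * value, and pandas_datetime_to_datetimestruct() to convert back.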
+ */ +int +convert_datetimestruct_to_datetime(pandas_datetime_metadata *meta, + const pandas_datetimestruct *dts, + npy_datetime *out); + +/* + * Calculates the days offset from the 1970 epoch. + */ +npy_int64 +get_datetimestruct_days(const pandas_datetimestruct *dts); + +/* + * Adjusts a datetimestruct based on a minutes offset. Assumes + * the current values are valid. + */ +void +add_minutes_to_datetimestruct(pandas_datetimestruct *dts, int minutes); + +/* + * This provides the casting rules for the TIMEDELTA data type units. + * + * Notably, there is a barrier between the nonlinear years and + * months units, and all the other units. + */ +//npy_bool +//can_cast_timedelta64_units(PANDAS_DATETIMEUNIT src_unit, +// PANDAS_DATETIMEUNIT dst_unit, +// NPY_CASTING casting); + +npy_bool +can_cast_datetime64_units(PANDAS_DATETIMEUNIT src_unit, + PANDAS_DATETIMEUNIT dst_unit, + NPY_CASTING casting); + + +int +convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, + npy_datetime dt, + pandas_datetimestruct *out); + + +PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); + + +#endif diff --git a/pandas/src/datetime/np_datetime_strings.c b/pandas/src/datetime/np_datetime_strings.c new file mode 100644 index 00000000..85705642 --- /dev/null +++ b/pandas/src/datetime/np_datetime_strings.c @@ -0,0 +1,1456 @@ +/* + * This file implements string parsing and creation for NumPy datetime. + * + * Written by Mark Wiebe (mwwiebe@gmail.com) + * Copyright (c) 2011 by Enthought, Inc. + * + * See NP_LICENSE.txt for the license. + */ + +#define PY_SSIZE_T_CLEAN +#define NO_IMPORT + +#include + +#include + +#include +#include "numpy/arrayscalars.h" + +#include "np_datetime.h" +#include "np_datetime_strings.h" + +NPY_NO_EXPORT const char * +npy_casting_to_string(NPY_CASTING casting) +{ + switch (casting) { + case NPY_NO_CASTING: + return "'no'"; + case NPY_EQUIV_CASTING: + return "'equiv'"; + case NPY_SAFE_CASTING: + return "'safe'"; + case NPY_SAME_KIND_CASTING: + return "'same_kind'"; + case NPY_UNSAFE_CASTING: + return "'unsafe'"; + default: + return ""; + } +} + +/* Platform-specific time_t typedef */ +typedef time_t NPY_TIME_T; + +/*// We *do* want these symbols, but for cython, not for C. fine in mac osx,*/ +/*// linux complains.*/ +/*static void _suppress_unused_variable_warning(void)*/ +/*{*/ +/* int x = _days_per_month_table[0][0];*/ +/* x = x;*/ + +/* int y = _month_offset[0][0];*/ +/* y = y;*/ + +/* char *z = _datetime_strings[0];*/ +/* z = z;*/ +/*}*/ + +/* Exported as DATETIMEUNITS in multiarraymodule.c */ +static char *_datetime_strings[PANDAS_DATETIME_NUMUNITS] = { + "Y", + "M", + "W", + "D", + "h", + "m", + "s", + "ms", + "us", + "ns", + "ps", + "fs", + "as", +}; +/* + * Wraps `localtime` functionality for multiple platforms. This + * converts a time value to a time structure in the local timezone. + * + * Returns 0 on success, -1 on failure. 
+ */ +static int +get_localtime(NPY_TIME_T *ts, struct tm *tms) +{ + char *func_name = ""; +#if defined(_WIN32) + #if defined(_MSC_VER) && (_MSC_VER >= 1400) + if (localtime_s(tms, ts) != 0) { + func_name = "localtime_s"; + goto fail; + } + #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) + if (_localtime64_s(tms, ts) != 0) { + func_name = "_localtime64_s"; + goto fail; + } + #else + struct tm *tms_tmp; + tms_tmp = localtime(ts); + if (tms_tmp == NULL) { + func_name = "localtime"; + goto fail; + } + memcpy(tms, tms_tmp, sizeof(struct tm)); + #endif +#else + if (localtime_r(ts, tms) == NULL) { + func_name = "localtime_r"; + goto fail; + } +#endif + + return 0; + +fail: + PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " + "to a local time", func_name); + return -1; +} + +/* + * Wraps `gmtime` functionality for multiple platforms. This + * converts a time value to a time structure in UTC. + * + * Returns 0 on success, -1 on failure. + */ +static int +get_gmtime(NPY_TIME_T *ts, struct tm *tms) +{ + char *func_name = ""; +#if defined(_WIN32) + #if defined(_MSC_VER) && (_MSC_VER >= 1400) + if (gmtime_s(tms, ts) != 0) { + func_name = "gmtime_s"; + goto fail; + } + #elif defined(__GNUC__) && defined(NPY_MINGW_USE_CUSTOM_MSVCR) + if (_gmtime64_s(tms, ts) != 0) { + func_name = "_gmtime64_s"; + goto fail; + } + #else + struct tm *tms_tmp; + tms_tmp = gmtime(ts); + if (tms_tmp == NULL) { + func_name = "gmtime"; + goto fail; + } + memcpy(tms, tms_tmp, sizeof(struct tm)); + #endif +#else + if (gmtime_r(ts, tms) == NULL) { + func_name = "gmtime_r"; + goto fail; + } +#endif + + return 0; + +fail: + PyErr_Format(PyExc_OSError, "Failed to use '%s' to convert " + "to a UTC time", func_name); + return -1; +} + +/* + * Converts a datetimestruct in UTC to a datetimestruct in local time, + * also returning the timezone offset applied. + * + * Returns 0 on success, -1 on failure. + */ +static int +convert_datetimestruct_utc_to_local(pandas_datetimestruct *out_dts_local, + const pandas_datetimestruct *dts_utc, int *out_timezone_offset) +{ + NPY_TIME_T rawtime = 0, localrawtime; + struct tm tm_; + npy_int64 year_correction = 0; + + /* Make a copy of the input 'dts' to modify */ + *out_dts_local = *dts_utc; + + /* HACK: Use a year < 2038 for later years for small time_t */ + if (sizeof(NPY_TIME_T) == 4 && out_dts_local->year >= 2038) { + if (is_leapyear(out_dts_local->year)) { + /* 2036 is a leap year */ + year_correction = out_dts_local->year - 2036; + out_dts_local->year -= year_correction; + } + else { + /* 2037 is not a leap year */ + year_correction = out_dts_local->year - 2037; + out_dts_local->year -= year_correction; + } + } + + /* + * Convert everything in 'dts' to a time_t, to minutes precision. + * This is POSIX time, which skips leap-seconds, but because + * we drop the seconds value from the pandas_datetimestruct, everything + * is ok for this operation. 
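+     * Only the fields down to minute resolution go through localtime;
+     * the seconds value is copied back unchanged below.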
+ */ + rawtime = (time_t)get_datetimestruct_days(out_dts_local) * 24 * 60 * 60; + rawtime += dts_utc->hour * 60 * 60; + rawtime += dts_utc->min * 60; + + /* localtime converts a 'time_t' into a local 'struct tm' */ + if (get_localtime(&rawtime, &tm_) < 0) { + return -1; + } + + /* Copy back all the values except seconds */ + out_dts_local->min = tm_.tm_min; + out_dts_local->hour = tm_.tm_hour; + out_dts_local->day = tm_.tm_mday; + out_dts_local->month = tm_.tm_mon + 1; + out_dts_local->year = tm_.tm_year + 1900; + + /* Extract the timezone offset that was applied */ + rawtime /= 60; + localrawtime = (time_t)get_datetimestruct_days(out_dts_local) * 24 * 60; + localrawtime += out_dts_local->hour * 60; + localrawtime += out_dts_local->min; + + *out_timezone_offset = localrawtime - rawtime; + + /* Reapply the year 2038 year correction HACK */ + out_dts_local->year += year_correction; + + return 0; +} + +/* + * Converts a datetimestruct in local time to a datetimestruct in UTC. + * + * Returns 0 on success, -1 on failure. + */ +static int +convert_datetimestruct_local_to_utc(pandas_datetimestruct *out_dts_utc, + const pandas_datetimestruct *dts_local) +{ + npy_int64 year_correction = 0; + + /* Make a copy of the input 'dts' to modify */ + *out_dts_utc = *dts_local; + + /* HACK: Use a year < 2038 for later years for small time_t */ + if (sizeof(NPY_TIME_T) == 4 && out_dts_utc->year >= 2038) { + if (is_leapyear(out_dts_utc->year)) { + /* 2036 is a leap year */ + year_correction = out_dts_utc->year - 2036; + out_dts_utc->year -= year_correction; + } + else { + /* 2037 is not a leap year */ + year_correction = out_dts_utc->year - 2037; + out_dts_utc->year -= year_correction; + } + } + + /* + * ISO 8601 states to treat date-times without a timezone offset + * or 'Z' for UTC as local time. The C standard libary functions + * mktime and gmtime allow us to do this conversion. + * + * Only do this timezone adjustment for recent and future years. + * In this case, "recent" is defined to be 1970 and later, because + * on MS Windows, mktime raises an error when given an earlier date. 
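+     * Datetimes before 1970 are therefore left unadjusted here and
+     * treated as if they were already in UTC.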
+ */ + if (out_dts_utc->year >= 1970) { + NPY_TIME_T rawtime = 0; + struct tm tm_; + + tm_.tm_sec = out_dts_utc->sec; + tm_.tm_min = out_dts_utc->min; + tm_.tm_hour = out_dts_utc->hour; + tm_.tm_mday = out_dts_utc->day; + tm_.tm_mon = out_dts_utc->month - 1; + tm_.tm_year = out_dts_utc->year - 1900; + tm_.tm_isdst = -1; + + /* mktime converts a local 'struct tm' into a time_t */ + rawtime = mktime(&tm_); + if (rawtime == -1) { + PyErr_SetString(PyExc_OSError, "Failed to use mktime to " + "convert local time to UTC"); + return -1; + } + + /* gmtime converts a 'time_t' into a UTC 'struct tm' */ + if (get_gmtime(&rawtime, &tm_) < 0) { + return -1; + } + out_dts_utc->sec = tm_.tm_sec; + out_dts_utc->min = tm_.tm_min; + out_dts_utc->hour = tm_.tm_hour; + out_dts_utc->day = tm_.tm_mday; + out_dts_utc->month = tm_.tm_mon + 1; + out_dts_utc->year = tm_.tm_year + 1900; + } + + /* Reapply the year 2038 year correction HACK */ + out_dts_utc->year += year_correction; + + return 0; +} + +/* int */ +/* parse_python_string(PyObject* obj, pandas_datetimestruct *dts) { */ +/* PyObject *bytes = NULL; */ +/* char *str = NULL; */ +/* Py_ssize_t len = 0; */ +/* PANDAS_DATETIMEUNIT bestunit = -1; */ + +/* /\* Convert to an ASCII string for the date parser *\/ */ +/* if (PyUnicode_Check(obj)) { */ +/* bytes = PyUnicode_AsASCIIString(obj); */ +/* if (bytes == NULL) { */ +/* return -1; */ +/* } */ +/* } */ +/* else { */ +/* bytes = obj; */ +/* Py_INCREF(bytes); */ +/* } */ +/* if (PyBytes_AsStringAndSize(bytes, &str, &len) == -1) { */ +/* Py_DECREF(bytes); */ +/* return -1; */ +/* } */ + +/* /\* Parse the ISO date *\/ */ +/* if (parse_iso_8601_datetime(str, len, PANDAS_FR_us, NPY_UNSAFE_CASTING, */ +/* dts, NULL, &bestunit, NULL) < 0) { */ +/* Py_DECREF(bytes); */ +/* return -1; */ +/* } */ +/* Py_DECREF(bytes); */ + +/* return 0; */ +/* } */ + + +/* + * Parses (almost) standard ISO 8601 date strings. The differences are: + * + * + The date "20100312" is parsed as the year 20100312, not as + * equivalent to "2010-03-12". The '-' in the dates are not optional. + * + Only seconds may have a decimal point, with up to 18 digits after it + * (maximum attoseconds precision). + * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate + * the date and the time. Both are treated equivalently. + * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats. + * + Doesn't handle leap seconds (seconds value has 60 in these cases). + * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow + * + Accepts special values "NaT" (not a time), "Today", (current + * day according to local time) and "Now" (current time in UTC). + * + * 'str' must be a NULL-terminated string, and 'len' must be its length. + * 'unit' should contain -1 if the unit is unknown, or the unit + * which will be used if it is. + * 'casting' controls how the detected unit from the string is allowed + * to be cast to the 'unit' parameter. + * + * 'out' gets filled with the parsed date-time. + * 'out_local' gets set to 1 if the parsed time was in local time, + * to 0 otherwise. The values 'now' and 'today' don't get counted + * as local, and neither do UTC +/-#### timezone offsets, because + * they aren't using the computer's local timezone offset. + * 'out_bestunit' gives a suggested unit based on the amount of + * resolution provided in the string, or -1 for NaT. + * 'out_special' gets set to 1 if the parsed time was 'today', + * 'now', or ''/'NaT'. 
For 'today', the unit recommended is + * 'D', for 'now', the unit recommended is 's', and for 'NaT' + * the unit recommended is 'Y'. + * + * Returns 0 on success, -1 on failure. + */ +int +parse_iso_8601_datetime(char *str, int len, + PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, + pandas_datetimestruct *out, + npy_bool *out_local, + PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special) +{ + int year_leap = 0; + int i, numdigits; + char *substr, sublen; + PANDAS_DATETIMEUNIT bestunit; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_datetimestruct)); + out->month = 1; + out->day = 1; + + + /* + * The string "today" means take today's date in local time, and + * convert it to a date representation. This date representation, if + * forced into a time unit, will be at midnight UTC. + * This is perhaps a little weird, but done so that the + * 'datetime64[D]' type produces the date you expect, rather than + * switching to an adjacent day depending on the current time and your + * timezone. + */ + if (len == 5 && tolower(str[0]) == 't' && + tolower(str[1]) == 'o' && + tolower(str[2]) == 'd' && + tolower(str[3]) == 'a' && + tolower(str[4]) == 'y') { + NPY_TIME_T rawtime = 0; + struct tm tm_; + + time(&rawtime); + if (get_localtime(&rawtime, &tm_) < 0) { + return -1; + } + out->year = tm_.tm_year + 1900; + out->month = tm_.tm_mon + 1; + out->day = tm_.tm_mday; + + bestunit = PANDAS_FR_D; + + /* + * Indicate that this was a special value, and + * is a date (unit 'D'). + */ + if (out_local != NULL) { + *out_local = 0; + } + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + if (out_special != NULL) { + *out_special = 1; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return 0; + } + + /* The string "now" resolves to the current UTC time */ + if (len == 3 && tolower(str[0]) == 'n' && + tolower(str[1]) == 'o' && + tolower(str[2]) == 'w') { + NPY_TIME_T rawtime = 0; + pandas_datetime_metadata meta; + + time(&rawtime); + + /* Set up a dummy metadata for the conversion */ + meta.base = PANDAS_FR_s; + meta.num = 1; + + bestunit = PANDAS_FR_s; + + /* + * Indicate that this was a special value, and + * use 's' because the time() function has resolution + * seconds. 
+ */ + if (out_local != NULL) { + *out_local = 0; + } + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + if (out_special != NULL) { + *out_special = 1; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return convert_datetime_to_datetimestruct(&meta, rawtime, out); + } + + /* Anything else isn't a special value */ + if (out_special != NULL) { + *out_special = 0; + } + + substr = str; + sublen = len; + + /* Skip leading whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + } + + /* Leading '-' sign for negative year */ + if (*substr == '-') { + ++substr; + --sublen; + } + + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE YEAR (digits until the '-' character) */ + out->year = 0; + while (sublen > 0 && isdigit(*substr)) { + out->year = 10 * out->year + (*substr - '0'); + ++substr; + --sublen; + } + + /* Negate the year if necessary */ + if (str[0] == '-') { + out->year = -out->year; + } + /* Check whether it's a leap-year */ + year_leap = is_leapyear(out->year); + + /* Next character must be a '-' or the end of the string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_Y; + goto finish; + } + else if (*substr == '-') { + ++substr; + --sublen; + } + else { + goto parse_error; + } + + /* Can't have a trailing '-' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE MONTH (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->month = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->month < 1 || out->month > 12) { + PyErr_Format(PyExc_ValueError, + "Month out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a '-' or the end of the string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_M; + goto finish; + } + else if (*substr == '-') { + ++substr; + --sublen; + } + else { + goto parse_error; + } + + /* Can't have a trailing '-' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE DAY (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->day = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->day < 1 || + out->day > _days_per_month_table[year_leap][out->month-1]) { + PyErr_Format(PyExc_ValueError, + "Day out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a 'T', ' ', or end of string */ + if (sublen == 0) { + if (out_local != NULL) { + *out_local = 0; + } + bestunit = PANDAS_FR_D; + goto finish; + } + else if (*substr != 'T' && *substr != ' ') { + goto parse_error; + } + else { + ++substr; + --sublen; + } + + /* PARSE THE HOURS (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->hour < 0 || out->hour >= 24) { + PyErr_Format(PyExc_ValueError, + "Hours out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a ':' or the end of the string */ + if (sublen > 0 && *substr == ':') { + ++substr; + 
--sublen; + } + else { + bestunit = PANDAS_FR_h; + goto parse_timezone; + } + + /* Can't have a trailing ':' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE MINUTES (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->min = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->hour < 0 || out->min >= 60) { + PyErr_Format(PyExc_ValueError, + "Minutes out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character must be a ':' or the end of the string */ + if (sublen > 0 && *substr == ':') { + ++substr; + --sublen; + } + else { + bestunit = PANDAS_FR_m; + goto parse_timezone; + } + + /* Can't have a trailing ':' */ + if (sublen == 0) { + goto parse_error; + } + + /* PARSE THE SECONDS (2 digits) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + out->sec = 10 * (substr[0] - '0') + (substr[1] - '0'); + + if (out->sec < 0 || out->sec >= 60) { + PyErr_Format(PyExc_ValueError, + "Seconds out of range in datetime string \"%s\"", str); + goto error; + } + substr += 2; + sublen -= 2; + } + else { + goto parse_error; + } + + /* Next character may be a '.' indicating fractional seconds */ + if (sublen > 0 && *substr == '.') { + ++substr; + --sublen; + } + else { + bestunit = PANDAS_FR_s; + goto parse_timezone; + } + + /* PARSE THE MICROSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->us *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->us += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = PANDAS_FR_us; + } + else { + bestunit = PANDAS_FR_ms; + } + goto parse_timezone; + } + + /* PARSE THE PICOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->ps *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->ps += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (sublen == 0 || !isdigit(*substr)) { + if (numdigits > 3) { + bestunit = PANDAS_FR_ps; + } + else { + bestunit = PANDAS_FR_ns; + } + goto parse_timezone; + } + + /* PARSE THE ATTOSECONDS (0 to 6 digits) */ + numdigits = 0; + for (i = 0; i < 6; ++i) { + out->as *= 10; + if (sublen > 0 && isdigit(*substr)) { + out->as += (*substr - '0'); + ++substr; + --sublen; + ++numdigits; + } + } + + if (numdigits > 3) { + bestunit = PANDAS_FR_as; + } + else { + bestunit = PANDAS_FR_fs; + } + +parse_timezone: + if (sublen == 0) { + if (convert_datetimestruct_local_to_utc(out, out) < 0) { + goto error; + } + + /* Since neither "Z" nor a time-zone was specified, it's local */ + if (out_local != NULL) { + *out_local = 1; + } + + goto finish; + } + + /* UTC specifier */ + if (*substr == 'Z') { + /* "Z" means not local */ + if (out_local != NULL) { + *out_local = 0; + } + + if (sublen == 1) { + goto finish; + } + else { + ++substr; + --sublen; + } + } + /* Time zone offset */ + else if (*substr == '-' || *substr == '+') { + int offset_neg = 0, offset_hour = 0, offset_minute = 0; + + /* + * Since "local" means local with respect to the current + * machine, we say this is non-local. 
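+         * For example, "2012-06-29T12:00+0500" sets out_local to 0,
+         * while the offset parsed below is still applied to give UTC.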
+ */ + if (out_local != NULL) { + *out_local = 0; + } + + if (*substr == '-') { + offset_neg = 1; + } + ++substr; + --sublen; + + /* The hours offset */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_hour = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_hour >= 24) { + PyErr_Format(PyExc_ValueError, + "Timezone hours offset out of range " + "in datetime string \"%s\"", str); + goto error; + } + } + else { + goto parse_error; + } + + /* The minutes offset is optional */ + if (sublen > 0) { + /* Optional ':' */ + if (*substr == ':') { + ++substr; + --sublen; + } + + /* The minutes offset (at the end of the string) */ + if (sublen >= 2 && isdigit(substr[0]) && isdigit(substr[1])) { + offset_minute = 10 * (substr[0] - '0') + (substr[1] - '0'); + substr += 2; + sublen -= 2; + if (offset_minute >= 60) { + PyErr_Format(PyExc_ValueError, + "Timezone minutes offset out of range " + "in datetime string \"%s\"", str); + goto error; + } + } + else { + goto parse_error; + } + } + + /* Apply the time zone offset */ + if (offset_neg) { + offset_hour = -offset_hour; + offset_minute = -offset_minute; + } + add_minutes_to_datetimestruct(out, -60 * offset_hour - offset_minute); + } + + /* Skip trailing whitespace */ + while (sublen > 0 && isspace(*substr)) { + ++substr; + --sublen; + } + + if (sublen != 0) { + goto parse_error; + } + +finish: + if (out_bestunit != NULL) { + *out_bestunit = bestunit; + } + + /* Check the casting rule */ + if (unit != -1 && !can_cast_datetime64_units(bestunit, unit, + casting)) { + PyErr_Format(PyExc_TypeError, "Cannot parse \"%s\" as unit " + "'%s' using casting rule %s", + str, _datetime_strings[unit], + npy_casting_to_string(casting)); + return -1; + } + + return 0; + +parse_error: + PyErr_Format(PyExc_ValueError, + "Error parsing datetime string \"%s\" at position %d", + str, (int)(substr-str)); + return -1; + +error: + return -1; +} + +/* + * Provides a string length to use for converting datetime + * objects with the given local and unit settings. + */ +int +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base) +{ + int len = 0; + + /* If no unit is provided, return the maximum length */ + if (base == -1) { + return PANDAS_DATETIME_MAX_ISO8601_STRLEN; + } + + switch (base) { + /* Generic units can only be used to represent NaT */ + /*case PANDAS_FR_GENERIC:*/ + /* return 4;*/ + case PANDAS_FR_as: + len += 3; /* "###" */ + case PANDAS_FR_fs: + len += 3; /* "###" */ + case PANDAS_FR_ps: + len += 3; /* "###" */ + case PANDAS_FR_ns: + len += 3; /* "###" */ + case PANDAS_FR_us: + len += 3; /* "###" */ + case PANDAS_FR_ms: + len += 4; /* ".###" */ + case PANDAS_FR_s: + len += 3; /* ":##" */ + case PANDAS_FR_m: + len += 3; /* ":##" */ + case PANDAS_FR_h: + len += 3; /* "T##" */ + case PANDAS_FR_D: + case PANDAS_FR_W: + len += 3; /* "-##" */ + case PANDAS_FR_M: + len += 3; /* "-##" */ + case PANDAS_FR_Y: + len += 21; /* 64-bit year */ + break; + default: + len += 3; /* handle the now defunct NPY_FR_B */ + break; + } + + if (base >= PANDAS_FR_h) { + if (local) { + len += 5; /* "+####" or "-####" */ + } + else { + len += 1; /* "Z" */ + } + } + + len += 1; /* NULL terminator */ + + return len; +} + +/* + * Finds the largest unit whose value is nonzero, and for which + * the remainder for the rest of the units is zero. 
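+ * For example, 2012-06-29 00:00:00 reports PANDAS_FR_D, while
+ * 2012-06-29 10:30:00 reports PANDAS_FR_m.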
+ */ +static PANDAS_DATETIMEUNIT +lossless_unit_from_datetimestruct(pandas_datetimestruct *dts) +{ + if (dts->as % 1000 != 0) { + return PANDAS_FR_as; + } + else if (dts->as != 0) { + return PANDAS_FR_fs; + } + else if (dts->ps % 1000 != 0) { + return PANDAS_FR_ps; + } + else if (dts->ps != 0) { + return PANDAS_FR_ns; + } + else if (dts->us % 1000 != 0) { + return PANDAS_FR_us; + } + else if (dts->us != 0) { + return PANDAS_FR_ms; + } + else if (dts->sec != 0) { + return PANDAS_FR_s; + } + else if (dts->min != 0) { + return PANDAS_FR_m; + } + else if (dts->hour != 0) { + return PANDAS_FR_h; + } + else if (dts->day != 1) { + return PANDAS_FR_D; + } + else if (dts->month != 1) { + return PANDAS_FR_M; + } + else { + return PANDAS_FR_Y; + } +} + +/* + * Converts an pandas_datetimestruct to an (almost) ISO 8601 + * NULL-terminated string. If the string fits in the space exactly, + * it leaves out the NULL terminator and returns success. + * + * The differences from ISO 8601 are the 'NaT' string, and + * the number of year digits is >= 4 instead of strictly 4. + * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset, otherwise it uses timezone Z (UTC). + * + * 'base' restricts the output to that unit. Set 'base' to + * -1 to auto-detect a base after which all the values are zero. + * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. + * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * + * Returns 0 on success, -1 on failure (for example if the output + * string was too short). + */ +int +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting) +{ + pandas_datetimestruct dts_local; + int timezone_offset = 0; + + char *substr = outstr, sublen = outlen; + int tmplen; + + /* Only do local time within a reasonable year range */ + if ((dts->year <= 1800 || dts->year >= 10000) && tzoffset == -1) { + local = 0; + } + + /* Automatically detect a good unit */ + if (base == -1) { + base = lossless_unit_from_datetimestruct(dts); + /* + * If there's a timezone, use at least minutes precision, + * and never split up hours and minutes by default + */ + if ((base < PANDAS_FR_m && local) || base == PANDAS_FR_h) { + base = PANDAS_FR_m; + } + /* Don't split up dates by default */ + else if (base < PANDAS_FR_D) { + base = PANDAS_FR_D; + } + } + /* + * Print weeks with the same precision as days. + * + * TODO: Could print weeks with YYYY-Www format if the week + * epoch is a Monday. 
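+     * The 1970-01-01 epoch fell on a Thursday, so for now the week
+     * unit is simply widened to day precision below.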
+ */ + else if (base == PANDAS_FR_W) { + base = PANDAS_FR_D; + } + + /* Use the C API to convert from UTC to local time */ + if (local && tzoffset == -1) { + if (convert_datetimestruct_utc_to_local(&dts_local, dts, + &timezone_offset) < 0) { + return -1; + } + + /* Set dts to point to our local time instead of the UTC time */ + dts = &dts_local; + } + /* Use the manually provided tzoffset */ + else if (local) { + /* Make a copy of the pandas_datetimestruct we can modify */ + dts_local = *dts; + dts = &dts_local; + + /* Set and apply the required timezone offset */ + timezone_offset = tzoffset; + add_minutes_to_datetimestruct(dts, timezone_offset); + } + + /* + * Now the datetimestruct data is in the final form for + * the string representation, so ensure that the data + * is being cast according to the casting rule. + */ + if (casting != NPY_UNSAFE_CASTING) { + /* Producing a date as a local time is always 'unsafe' */ + if (base <= PANDAS_FR_D && local) { + PyErr_SetString(PyExc_TypeError, "Cannot create a local " + "timezone-based date string from a NumPy " + "datetime without forcing 'unsafe' casting"); + return -1; + } + /* Only 'unsafe' and 'same_kind' allow data loss */ + else { + PANDAS_DATETIMEUNIT unitprec; + + unitprec = lossless_unit_from_datetimestruct(dts); + if (casting != NPY_SAME_KIND_CASTING && unitprec > base) { + PyErr_Format(PyExc_TypeError, "Cannot create a " + "string with unit precision '%s' " + "from the NumPy datetime, which has data at " + "unit precision '%s', " + "requires 'unsafe' or 'same_kind' casting", + _datetime_strings[base], + _datetime_strings[unitprec]); + return -1; + } + } + } + + /* YEAR */ + /* + * Can't use PyOS_snprintf, because it always produces a '\0' + * character at the end, and NumPy string types are permitted + * to have data all the way to the end of the buffer. 
+ */ +#ifdef _WIN32 + tmplen = _snprintf(substr, sublen, "%04" NPY_INT64_FMT, dts->year); +#else + tmplen = snprintf(substr, sublen, "%04" NPY_INT64_FMT, (long int)dts->year); +#endif + /* If it ran out of space or there isn't space for the NULL terminator */ + if (tmplen < 0 || tmplen > sublen) { + goto string_too_short; + } + substr += tmplen; + sublen -= tmplen; + + /* Stop if the unit is years */ + if (base == PANDAS_FR_Y) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* MONTH */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->month / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->month % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is months */ + if (base == PANDAS_FR_M) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* DAY */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '-'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->day / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->day % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is days */ + if (base == PANDAS_FR_D) { + if (sublen > 0) { + *substr = '\0'; + } + return 0; + } + + /* HOUR */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = 'T'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->hour / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->hour % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is hours */ + if (base == PANDAS_FR_h) { + goto add_time_zone; + } + + /* MINUTE */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->min / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->min % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is minutes */ + if (base == PANDAS_FR_m) { + goto add_time_zone; + } + + /* SECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = ':'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->sec / 10) + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->sec % 10) + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is seconds */ + if (base == PANDAS_FR_s) { + goto add_time_zone; + } + + /* MILLISECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = '.'; + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 100000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->us / 10000) % 10 + '0'); + if (sublen < 4 ) { + goto string_too_short; + } + substr[3] = (char)((dts->us / 1000) % 10 + '0'); + substr += 4; + sublen -= 4; + + /* Stop if the unit is milliseconds */ + if (base == PANDAS_FR_ms) { + goto add_time_zone; + } + + /* MICROSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->us / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->us / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->us % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is microseconds */ + if (base == PANDAS_FR_us) { + goto add_time_zone; + } + + /* NANOSECOND */ + if (sublen < 1 ) { 
+ goto string_too_short; + } + substr[0] = (char)((dts->ps / 100000) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->ps / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is nanoseconds */ + if (base == PANDAS_FR_ns) { + goto add_time_zone; + } + + /* PICOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->ps / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->ps / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->ps % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is picoseconds */ + if (base == PANDAS_FR_ps) { + goto add_time_zone; + } + + /* FEMTOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100000) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10000) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)((dts->as / 1000) % 10 + '0'); + substr += 3; + sublen -= 3; + + /* Stop if the unit is femtoseconds */ + if (base == PANDAS_FR_fs) { + goto add_time_zone; + } + + /* ATTOSECOND */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((dts->as / 100) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((dts->as / 10) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(dts->as % 10 + '0'); + substr += 3; + sublen -= 3; + +add_time_zone: + if (local) { + /* Add the +/- sign */ + if (sublen < 1) { + goto string_too_short; + } + if (timezone_offset < 0) { + substr[0] = '-'; + timezone_offset = -timezone_offset; + } + else { + substr[0] = '+'; + } + substr += 1; + sublen -= 1; + + /* Add the timezone offset */ + if (sublen < 1 ) { + goto string_too_short; + } + substr[0] = (char)((timezone_offset / (10*60)) % 10 + '0'); + if (sublen < 2 ) { + goto string_too_short; + } + substr[1] = (char)((timezone_offset / 60) % 10 + '0'); + if (sublen < 3 ) { + goto string_too_short; + } + substr[2] = (char)(((timezone_offset % 60) / 10) % 10 + '0'); + if (sublen < 4 ) { + goto string_too_short; + } + substr[3] = (char)((timezone_offset % 60) % 10 + '0'); + substr += 4; + sublen -= 4; + } + /* UTC "Zulu" time */ + else { + if (sublen < 1) { + goto string_too_short; + } + substr[0] = 'Z'; + substr += 1; + sublen -= 1; + } + + /* Add a NULL terminator, and return */ + if (sublen > 0) { + substr[0] = '\0'; + } + + return 0; + +string_too_short: + PyErr_Format(PyExc_RuntimeError, + "The string provided for NumPy ISO datetime formatting " + "was too short, with length %d", + outlen); + return -1; +} diff --git a/pandas/src/datetime/np_datetime_strings.h b/pandas/src/datetime/np_datetime_strings.h new file mode 100644 index 00000000..9a2488fe --- /dev/null +++ b/pandas/src/datetime/np_datetime_strings.h @@ -0,0 +1,86 @@ +/* + * This is derived from numpy 1.7. See NP_LICENSE.txt + */ + +#ifndef _NPY_PRIVATE__DATETIME_STRINGS_H_ +#define _NPY_PRIVATE__DATETIME_STRINGS_H_ + +/* + * Parses (almost) standard ISO 8601 date strings. The differences are: + * + * + The date "20100312" is parsed as the year 20100312, not as + * equivalent to "2010-03-12". The '-' in the dates are not optional. + * + Only seconds may have a decimal point, with up to 18 digits after it + * (maximum attoseconds precision). 
+ * + Either a 'T' as in ISO 8601 or a ' ' may be used to separate + * the date and the time. Both are treated equivalently. + * + Doesn't (yet) handle the "YYYY-DDD" or "YYYY-Www" formats. + * + Doesn't handle leap seconds (seconds value has 60 in these cases). + * + Doesn't handle 24:00:00 as synonym for midnight (00:00:00) tomorrow + * + Accepts special values "NaT" (not a time), "Today", (current + * day according to local time) and "Now" (current time in UTC). + * + * 'str' must be a NULL-terminated string, and 'len' must be its length. + * 'unit' should contain -1 if the unit is unknown, or the unit + * which will be used if it is. + * 'casting' controls how the detected unit from the string is allowed + * to be cast to the 'unit' parameter. + * + * 'out' gets filled with the parsed date-time. + * 'out_local' gets set to 1 if the parsed time was in local time, + * to 0 otherwise. The values 'now' and 'today' don't get counted + * as local, and neither do UTC +/-#### timezone offsets, because + * they aren't using the computer's local timezone offset. + * 'out_bestunit' gives a suggested unit based on the amount of + * resolution provided in the string, or -1 for NaT. + * 'out_special' gets set to 1 if the parsed time was 'today', + * 'now', or ''/'NaT'. For 'today', the unit recommended is + * 'D', for 'now', the unit recommended is 's', and for 'NaT' + * the unit recommended is 'Y'. + * + * Returns 0 on success, -1 on failure. + */ +int +parse_iso_8601_datetime(char *str, int len, + PANDAS_DATETIMEUNIT unit, + NPY_CASTING casting, + pandas_datetimestruct *out, + npy_bool *out_local, + PANDAS_DATETIMEUNIT *out_bestunit, + npy_bool *out_special); + +/* + * Provides a string length to use for converting datetime + * objects with the given local and unit settings. + */ +int +get_datetime_iso_8601_strlen(int local, PANDAS_DATETIMEUNIT base); + +/* + * Converts an pandas_datetimestruct to an (almost) ISO 8601 + * NULL-terminated string. + * + * If 'local' is non-zero, it produces a string in local time with + * a +-#### timezone offset, otherwise it uses timezone Z (UTC). + * + * 'base' restricts the output to that unit. Set 'base' to + * -1 to auto-detect a base after which all the values are zero. + * + * 'tzoffset' is used if 'local' is enabled, and 'tzoffset' is + * set to a value other than -1. This is a manual override for + * the local time zone to use, as an offset in minutes. + * + * 'casting' controls whether data loss is allowed by truncating + * the data to a coarser unit. This interacts with 'local', slightly, + * in order to form a date unit string as a local time, the casting + * must be unsafe. + * + * Returns 0 on success, -1 on failure (for example if the output + * string was too short). 
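+ *
+ * Illustrative outputs, assuming a sufficiently large output buffer: with
+ * base=PANDAS_FR_D and local=0 the result looks like "2012-06-29" (date
+ * units never get a timezone suffix); with base=PANDAS_FR_s and local=0 it
+ * looks like "2012-06-29T17:02:28Z"; with base=PANDAS_FR_m, local=1 and
+ * tzoffset=-300 the same instant becomes "2012-06-29T12:02-0500".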
+ */ +int +make_iso_8601_datetime(pandas_datetimestruct *dts, char *outstr, int outlen, + int local, PANDAS_DATETIMEUNIT base, int tzoffset, + NPY_CASTING casting); + +#endif diff --git a/pandas/src/datetime_helper.h b/pandas/src/datetime_helper.h new file mode 100644 index 00000000..8be5f597 --- /dev/null +++ b/pandas/src/datetime_helper.h @@ -0,0 +1,6 @@ +#include "datetime.h" + +void mangle_nat(PyObject *val) { + PyDateTime_GET_MONTH(val) = -1; + PyDateTime_GET_DAY(val) = -1; +} diff --git a/pandas/src/engines.pyx b/pandas/src/engines.pyx new file mode 100644 index 00000000..1cd3e85f --- /dev/null +++ b/pandas/src/engines.pyx @@ -0,0 +1,526 @@ +from numpy cimport ndarray + +from numpy cimport float64_t, int32_t, int64_t, uint8_t +cimport cython + +cimport numpy as cnp + +cnp.import_array() +cnp.import_ufunc() + +cimport util + +import numpy as np + +import _algos + +# include "hashtable.pyx" + +cdef extern from "datetime.h": + bint PyDateTime_Check(object o) + void PyDateTime_IMPORT() + +PyDateTime_IMPORT + +cdef extern from "Python.h": + int PySlice_Check(object) + +# int PyList_Check(object) +# int PyTuple_Check(object) + +cdef inline is_definitely_invalid_key(object val): + if PyTuple_Check(val): + try: + hash(val) + except TypeError: + return True + + return (PySlice_Check(val) or cnp.PyArray_Check(val) + or PyList_Check(val)) + +def get_value_at(ndarray arr, object loc): + return util.get_value_at(arr, loc) + +def set_value_at(ndarray arr, object loc, object val): + return util.set_value_at(arr, loc, val) + + +# Don't populate hash tables in monotonic indexes larger than this +cdef int _SIZE_CUTOFF = 1000000 + + +cdef class IndexEngine: + + cdef readonly: + object vgetter + HashTable mapping + bint over_size_threshold + + cdef: + bint unique, monotonic + bint initialized, monotonic_check, unique_check + + def __init__(self, vgetter, n): + self.vgetter = vgetter + + self.over_size_threshold = n >= _SIZE_CUTOFF + + self.initialized = 0 + self.monotonic_check = 0 + + self.unique = 0 + self.monotonic = 0 + + def __contains__(self, object val): + self._ensure_mapping_populated() + hash(val) + return val in self.mapping + + cpdef get_value(self, ndarray arr, object key): + ''' + arr : 1-dimensional ndarray + ''' + cdef: + object loc + void* data_ptr + + loc = self.get_loc(key) + if PySlice_Check(loc) or cnp.PyArray_Check(loc): + return arr[loc] + else: + if arr.descr.type_num == NPY_DATETIME: + return Timestamp(util.get_value_at(arr, loc)) + return util.get_value_at(arr, loc) + + cpdef set_value(self, ndarray arr, object key, object value): + ''' + arr : 1-dimensional ndarray + ''' + cdef: + object loc + void* data_ptr + + loc = self.get_loc(key) + value = convert_scalar(arr, value) + + if PySlice_Check(loc) or cnp.PyArray_Check(loc): + arr[loc] = value + else: + util.set_value_at(arr, loc, value) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError + + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + loc = values.searchsorted(val, side='left') + if util.get_value_at(values, loc) != val: + raise KeyError(val) + return loc + + self._ensure_mapping_populated() + if not self.unique: + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val) + except TypeError: + self._check_type(val) + raise KeyError(val) + + cdef inline _get_loc_duplicates(self, object val): + cdef: + Py_ssize_t diff + + if self.is_monotonic: + values = 
self._get_index_values() + + left = values.searchsorted(val, side='left') + right = values.searchsorted(val, side='right') + + diff = right - left + if diff == 0: + raise KeyError(val) + elif diff == 1: + return left + else: + return slice(left, right) + else: + return self._get_bool_indexer(val) + + cdef _get_bool_indexer(self, object val): + cdef: + ndarray[uint8_t] indexer + ndarray[object] values + int count = 0 + Py_ssize_t i, n + + values = self._get_index_values() + n = len(values) + + result = np.empty(n, dtype=bool) + indexer = result.view(np.uint8) + + for i in range(n): + if values[i] == val: + count += 1 + indexer[i] = 1 + else: + indexer[i] = 0 + + if count == 0: + raise KeyError(val) + + return result + + property is_unique: + + def __get__(self): + if not self.unique_check: + self._do_unique_check() + + return self.unique == 1 + + property is_monotonic: + + def __get__(self): + if not self.monotonic_check: + self._do_monotonic_check() + + return self.monotonic == 1 + + cdef inline _do_monotonic_check(self): + try: + values = self._get_index_values() + self.monotonic, unique = self._call_monotonic(values) + + if unique is not None: + self.unique = unique + self.unique_check = 1 + + except TypeError: + self.monotonic = 0 + self.monotonic_check = 1 + + cdef _get_index_values(self): + return self.vgetter() + + cdef inline _do_unique_check(self): + self._ensure_mapping_populated() + + def _call_monotonic(self, values): + raise NotImplementedError + + cdef _make_hash_table(self, n): + raise NotImplementedError + + cdef inline _check_type(self, object val): + hash(val) + + cdef inline _ensure_mapping_populated(self): + if not self.initialized: + self.initialize() + + cdef initialize(self): + values = self._get_index_values() + + self.mapping = self._make_hash_table(len(values)) + self.mapping.map_locations(values) + + if len(self.mapping) == len(values): + self.unique = 1 + self.unique_check = 1 + + self.initialized = 1 + + def clear_mapping(self): + self.mapping = None + self.initialized = 0 + + def get_indexer(self, values): + self._ensure_mapping_populated() + return self.mapping.lookup(values) + + + +# @cache_readonly +# def _monotonicity_check(self): +# try: +# f = self._algos['is_monotonic'] +# # wrong buffer type raises ValueError +# return f(self.values) +# except TypeError: +# return False, None + + + +cdef class Int64Engine(IndexEngine): + + # cdef Int64HashTable mapping + + cdef _make_hash_table(self, n): + return Int64HashTable(n) + + def _call_monotonic(self, values): + return _algos.is_monotonic_int64(values) + + def get_pad_indexer(self, other, limit=None): + return _algos.pad_int64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return _algos.backfill_int64(self._get_index_values(), other, + limit=limit) + + cdef _get_bool_indexer(self, object val): + cdef: + ndarray[uint8_t, cast=True] indexer + ndarray[int64_t] values + int count = 0 + Py_ssize_t i, n + int64_t ival + + if not util.is_integer_object(val): + raise KeyError(val) + + ival = val + + values = self._get_index_values() + n = len(values) + + result = np.empty(n, dtype=bool) + indexer = result.view(np.uint8) + + for i in range(n): + if values[i] == val: + count += 1 + indexer[i] = 1 + else: + indexer[i] = 0 + + if count == 0: + raise KeyError(val) + + return result + +cdef class Float64Engine(IndexEngine): + + # cdef Float64HashTable mapping + + cdef _make_hash_table(self, n): + return Float64HashTable(n) + + def _call_monotonic(self, values): + 
return _algos.is_monotonic_float64(values) + + def get_pad_indexer(self, other, limit=None): + return _algos.pad_float64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return _algos.backfill_float64(self._get_index_values(), other, + limit=limit) + +_pad_functions = { + 'object' : _algos.pad_object, + 'int64' : _algos.pad_int64, + 'float64' : _algos.pad_float64 +} + +_backfill_functions = { + 'object': _algos.backfill_object, + 'int64': _algos.backfill_int64, + 'float64': _algos.backfill_float64 +} + +cdef class ObjectEngine(IndexEngine): + + # cdef PyObjectHashTable mapping + + cdef _make_hash_table(self, n): + return PyObjectHashTable(n) + + def _call_monotonic(self, values): + return _algos.is_monotonic_object(values) + + def get_pad_indexer(self, other, limit=None): + return _algos.pad_object(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + return _algos.backfill_object(self._get_index_values(), other, + limit=limit) + + +cdef class DatetimeEngine(Int64Engine): + + def __contains__(self, object val): + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + return util.get_value_at(values, loc) == conv + + self._ensure_mapping_populated() + return _to_i8(val) in self.mapping + + cdef _get_index_values(self): + return self.vgetter().view('i8') + + def _call_monotonic(self, values): + return _algos.is_monotonic_int64(values) + + cpdef get_loc(self, object val): + if is_definitely_invalid_key(val): + raise TypeError + + # Welcome to the spaghetti factory + + if self.over_size_threshold and self.is_monotonic: + if not self.is_unique: + return self._get_loc_duplicates(val) + values = self._get_index_values() + conv = _to_i8(val) + loc = values.searchsorted(conv, side='left') + if util.get_value_at(values, loc) != conv: + raise KeyError(val) + return loc + + self._ensure_mapping_populated() + if not self.unique: + val = _to_i8(val) + return self._get_loc_duplicates(val) + + try: + return self.mapping.get_item(val.value) + except KeyError: + raise KeyError(val) + except AttributeError: + pass + + try: + val = _to_i8(val) + return self.mapping.get_item(val) + except TypeError: + self._date_check_type(val) + raise KeyError(val) + + cdef inline _date_check_type(self, object val): + hash(val) + if not util.is_integer_object(val): + raise KeyError(val) + + def get_indexer(self, values): + self._ensure_mapping_populated() + if values.dtype != 'M8[ns]': + return np.repeat(-1, len(values)).astype('i4') + values = np.asarray(values).view('i8') + return self.mapping.lookup(values) + + def get_pad_indexer(self, other, limit=None): + if other.dtype != 'M8[ns]': + return np.repeat(-1, len(other)).astype('i4') + other = np.asarray(other).view('i8') + return _algos.pad_int64(self._get_index_values(), other, + limit=limit) + + def get_backfill_indexer(self, other, limit=None): + if other.dtype != 'M8[ns]': + return np.repeat(-1, len(other)).astype('i4') + other = np.asarray(other).view('i8') + return _algos.backfill_int64(self._get_index_values(), other, + limit=limit) + + +cpdef convert_scalar(ndarray arr, object value): + if arr.descr.type_num == NPY_DATETIME: + if isinstance(value, _Timestamp): + return (<_Timestamp> value).value + elif value is None or value != value: + return iNaT + else: + return Timestamp(value).value + + if 
issubclass(arr.dtype.type, (np.integer, np.bool_)): + if util.is_float_object(value) and value != value: + raise ValueError('Cannot assign nan to integer series') + + return value + +cdef inline _to_i8(object val): + cdef pandas_datetimestruct dts + try: + return val.value + except AttributeError: + if util.is_datetime64_object(val): + return get_datetime64_value(val) + elif PyDateTime_Check(val): + return _pydatetime_to_dts(val, &dts) + return val + + +# ctypedef fused idxvalue_t: +# object +# int +# float64_t +# int32_t +# int64_t + +# @cython.boundscheck(False) +# @cython.wraparound(False) +# def is_monotonic(ndarray[idxvalue_t] arr): +# ''' +# Returns +# ------- +# is_monotonic, is_unique +# ''' +# cdef: +# Py_ssize_t i, n +# idxvalue_t prev, cur +# bint is_unique = 1 + +# n = len(arr) + +# if n < 2: +# return True, True + +# prev = arr[0] +# for i in range(1, n): +# cur = arr[i] +# if cur < prev: +# return False, None +# elif cur == prev: +# is_unique = 0 +# prev = cur +# return True, is_unique + + +# @cython.wraparound(False) +# @cython.boundscheck(False) +# def groupby_index(ndarray[idxvalue_t] index, ndarray labels): +# cdef dict result = {} +# cdef Py_ssize_t i, length +# cdef list members +# cdef object idx, key + +# length = len(index) + +# for i in range(length): +# key = util.get_value_1d(labels, i) + +# if util._checknull(key): +# continue + +# idx = index[i] +# if key in result: +# members = result[key] +# members.append(idx) +# else: +# result[key] = [idx] + +# return result diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py new file mode 100644 index 00000000..d77c19e8 --- /dev/null +++ b/pandas/src/generate_code.py @@ -0,0 +1,1233 @@ +import os +from cStringIO import StringIO + +header = """ +cimport numpy as np +cimport cython + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + +""" + + +take_1d_template = """@cython.wraparound(False) +def take_1d_%(name)s(ndarray[%(c_type)s] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[%(c_type)s] outbuf + %(c_type)s fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if %(raise_on_na)s and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx 
== -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + +""" + +take_2d_axis0_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[%(c_type)s, ndim=2] outbuf + %(c_type)s fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if %(raise_on_na)s and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +""" + +take_2d_axis1_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[%(c_type)s, ndim=2] outbuf + %(c_type)s fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if %(raise_on_na)s and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +""" + +take_2d_multi_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[%(c_type)s, ndim=2] outbuf + %(c_type)s fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if %(raise_on_na)s and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +""" + + +def set_na(na ="NaN"): + return "outbuf[i] = %s" % na + +def set_na_2d(na = "NaN"): + return "outbuf[i, j] = %s" % na + +raise_on_na = "raise ValueError('No NA values allowed')" + +''' +Backfilling logic for generating fill vector + +Diagram of what's going on + +Old New Fill vector Mask + . 0 1 + . 0 1 + . 0 1 +A A 0 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 + . 1 1 +B B 1 1 + . 2 1 + . 2 1 + . 2 1 +C C 2 1 + . 0 + . 
0 +D +''' + +backfill_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef %(c_type)s cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +""" + + +pad_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def pad_%(name)s(ndarray[%(c_type)s] old, ndarray[%(c_type)s] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef %(c_type)s cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +""" + +pad_1d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_%(name)s(ndarray[%(c_type)s] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +""" + +pad_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +""" + 
+backfill_2d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_%(name)s(ndarray[%(c_type)s, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +""" + +backfill_1d_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_%(name)s(ndarray[%(c_type)s] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef %(c_type)s val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +""" + +is_monotonic_template = """@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_%(name)s(ndarray[%(c_type)s] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + %(c_type)s prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +""" + +map_indices_template = """@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_%(name)s(ndarray[%(c_type)s] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +""" + +groupby_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_%(name)s(ndarray[%(c_type)s] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +""" + +arrmap_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_%(name)s(ndarray[%(c_type)s] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +""" + +#---------------------------------------------------------------------- +# Joins on ordered, unique indices + +# right might contain non-unique values + +left_join_unique_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + %(c_type)s lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +""" + +# @cython.wraparound(False) +# @cython.boundscheck(False) + +left_join_template = """ +def left_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +""" + + +inner_join_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +""" + + +outer_join_template2 = """@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = 
lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +""" + +outer_join_template = """@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_%(name)s(ndarray[%(c_type)s] left, + ndarray[%(c_type)s] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + %(c_type)s lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[%(c_type)s] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + i += 1 + count += 1 + break + else: + if left[i] == right[j]: + i += 1 + j += 1 + elif left[i] < right[j]: + i += 1 + else: + j += 1 + + count += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=%(dtype)s) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + while True: + if i == nleft: + if j == nright: + # we are done + break + else: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + j += 1 + count += 1 + break + elif j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + else: + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + i += 1 + j += 1 + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + j += 1 + + count += 1 + + return result, lindexer, rindexer + +""" + +# ensure_dtype functions + +ensure_dtype_template = """ +cpdef ensure_%(name)s(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_%(ctype)s: + return arr + else: + return arr.astype(np.%(dtype)s) + else: + return np.array(arr, dtype=np.%(dtype)s) + +""" + +ensure_functions = [ + ('float64', 'FLOAT64', 'float64'), + ('int32', 'INT32', 'int32'), + ('int64', 'INT64', 'int64'), + # ('platform_int', 'INT', 'int_'), + ('object', 'OBJECT', 'object_'), +] + +def generate_ensure_dtypes(): + output = StringIO() + for name, ctype, dtype in ensure_functions: + filled = ensure_dtype_template % locals() + output.write(filled) + return output.getvalue() + +#---------------------------------------------------------------------- +# Fast "put" logic for speeding up interleaving logic + +put2d_template = """ +def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values, + ndarray[int64_t] indexer, Py_ssize_t loc, + ndarray[%(dest_type2)s] out): + cdef: + Py_ssize_t i, j, k + + k = len(values) + for j from 0 <= j < k: + i = indexer[j] + out[i] = values[j, loc] +""" + + +#------------------------------------------------------------------------- +# Generators + +def generate_put_functions(): + function_list = [ + ('float64', 'float64_t', 'object'), + ('float64', 'float64_t', 'float64_t'), + ('object', 'object', 'object'), + ('int32', 'int32_t', 'int64_t'), + ('int32', 'int32_t', 'float64_t'), + ('int32', 'int32_t', 'object'), + ('int64', 'int64_t', 'int64_t'), + ('int64', 'int64_t', 'float64_t'), + ('int64', 'int64_t', 'object'), + ('bool', 'uint8_t', 'uint8_t'), + ('bool', 'uint8_t', 'object') + ] + + output = StringIO() + for name, c_type, dest_type in function_list: + func = 
put2d_template % {'name' : name, 'c_type' : c_type, + 'dest_type' : dest_type.replace('_t', ''), + 'dest_type2' : dest_type} + output.write(func) + return output.getvalue() + + +# name, ctype, capable of holding NA +function_list = [ + ('float64', 'float64_t', 'np.float64', True), + ('object', 'object', 'object', True), + ('int32', 'int32_t', 'np.int32', False), + ('int64', 'int64_t', 'np.int64', False), + ('bool', 'uint8_t', 'np.bool', False) +] + +def generate_from_template(template, ndim=1, exclude=None): + output = StringIO() + for name, c_type, dtype, can_hold_na in function_list: + if exclude is not None and name in exclude: + continue + + func = template % {'name': name, 'c_type': c_type, + 'dtype': dtype, + 'raise_on_na': 'False' if can_hold_na else 'True'} + output.write(func) + return output.getvalue() + +templates_1d = [map_indices_template, + pad_template, + backfill_template, + pad_1d_template, + backfill_1d_template, + pad_2d_template, + backfill_2d_template, + take_1d_template, + is_monotonic_template, + groupby_template, + arrmap_template] + +nobool_1d_templates = [left_join_unique_template, + left_join_template, + outer_join_template2, + inner_join_template] + +templates_2d = [take_2d_axis0_template, + take_2d_axis1_template, + take_2d_multi_template] + +def generate_take_cython_file(path='generated.pyx'): + with open(path, 'w') as f: + print >> f, header + + print >> f, generate_ensure_dtypes() + + for template in templates_1d: + print >> f, generate_from_template(template) + + for template in templates_2d: + print >> f, generate_from_template(template, ndim=2) + + # for template in templates_1d_datetime: + # print >> f, generate_from_template_datetime(template) + + # for template in templates_2d_datetime: + # print >> f, generate_from_template_datetime(template, ndim=2) + + for template in nobool_1d_templates: + print >> f, generate_from_template(template, exclude=['bool']) + +if __name__ == '__main__': + generate_take_cython_file() diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx new file mode 100644 index 00000000..d9439410 --- /dev/null +++ b/pandas/src/generated.pyx @@ -0,0 +1,4017 @@ + +cimport numpy as np +cimport cython + +from numpy cimport * + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +import numpy as np +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +from khash cimport * + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cdef int PLATFORM_INT = ( np.arange(0, dtype=np.int_)).descr.type_num + +cpdef ensure_platform_int(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == PLATFORM_INT: + return arr + else: + return arr.astype(np.int_) + else: + return np.array(arr, dtype=np.int_) + + + +cpdef ensure_float64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_FLOAT64: + return arr + else: + return arr.astype(np.float64) + else: + return np.array(arr, 
dtype=np.float64) + + +cpdef ensure_int32(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT32: + return arr + else: + return arr.astype(np.int32) + else: + return np.array(arr, dtype=np.int32) + + +cpdef ensure_int64(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_INT64: + return arr + else: + return arr.astype(np.int64) + else: + return np.array(arr, dtype=np.int64) + + +cpdef ensure_object(object arr): + if util.is_array(arr): + if ( arr).descr.type_num == NPY_OBJECT: + return arr + else: + return arr.astype(np.object_) + else: + return np.array(arr, dtype=np.object_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_float64(ndarray[float64_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_object(ndarray[object] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int32(ndarray[int32_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_int64(ndarray[int64_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. + ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef map_indices_bool(ndarray[uint8_t] index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i in range(length): + result[index[i]] = i + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
pad_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, next + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: + return indexer + + i = j = 0 + + cur = old[0] + + while j <= nright - 1 and new[j] < cur: + j += 1 + + while True: + if j == nright: + break + + if i == nleft - 1: + while j < nright: + if new[j] == cur: + indexer[j] = i + elif new[j] > cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + break + + next = old[i + 1] + + while j < nright and cur <= new[j] < next: + if new[j] == cur: + indexer[j] = i + elif fill_count < lim: + indexer[j] = i + fill_count += 1 + j += 1 + + fill_count = 0 + i += 1 + cur = next + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_float64(ndarray[float64_t] old, ndarray[float64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef float64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_object(ndarray[object] old, ndarray[object] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef object cur, prev + cdef int lim, fill_count 
= 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int32(ndarray[int32_t] old, ndarray[int32_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int32_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_int64(ndarray[int64_t] old, ndarray[int64_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef int64_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_bool(ndarray[uint8_t] old, ndarray[uint8_t] new, + limit=None): + cdef Py_ssize_t i, j, nleft, nright + cdef ndarray[int64_t, ndim=1] indexer + cdef uint8_t cur, prev + cdef int lim, fill_count = 0 + + nleft = len(old) + nright = len(new) + indexer = np.empty(nright, dtype=np.int64) + indexer.fill(-1) + + if limit is None: + lim = nright + else: + if limit < 0: + 
raise ValueError('Limit must be non-negative') + lim = limit + + if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: + return indexer + + i = nleft - 1 + j = nright - 1 + + cur = old[nleft - 1] + + while j >= 0 and new[j] > cur: + j -= 1 + + while True: + if j < 0: + break + + if i == 0: + while j >= 0: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + break + + prev = old[i - 1] + + while j >= 0 and prev < new[j] <= cur: + if new[j] == cur: + indexer[j] = i + elif new[j] < cur and fill_count < lim: + indexer[j] = i + fill_count += 1 + j -= 1 + + fill_count = 0 + i -= 1 + cur = prev + + return indexer + + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[0] + for i in range(N): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_float64(ndarray[float64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef 
Py_ssize_t i, N + cdef float64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_object(ndarray[object] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef object val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int32(ndarray[int32_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int32_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_int64(ndarray[int64_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef int64_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_inplace_bool(ndarray[uint8_t] values, + ndarray[uint8_t, cast=True] mask, + limit=None): + cdef Py_ssize_t i, N + cdef uint8_t val + cdef int lim, fill_count = 0 + + N = len(values) + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + val = values[N - 1] + for i in range(N - 1, -1 , -1): + if mask[i]: + if fill_count >= lim: + continue + fill_count += 1 + values[i] = val + else: + fill_count = 0 + val = values[i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 
0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def pad_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, 0] + for i in range(N): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_float64(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef float64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_object(ndarray[object, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef object val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int32(ndarray[int32_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, 
j, N, K + cdef int32_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_int64(ndarray[int64_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef int64_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] +@cython.boundscheck(False) +@cython.wraparound(False) +def backfill_2d_inplace_bool(ndarray[uint8_t, ndim=2] values, + ndarray[uint8_t, ndim=2] mask, + limit=None): + cdef Py_ssize_t i, j, N, K + cdef uint8_t val + cdef int lim, fill_count = 0 + + K, N = ( values).shape + + if limit is None: + lim = N + else: + if limit < 0: + raise ValueError('Limit must be non-negative') + lim = limit + + for j in range(K): + fill_count = 0 + val = values[j, N - 1] + for i in range(N - 1, -1 , -1): + if mask[j, i]: + if fill_count >= lim: + continue + fill_count += 1 + values[j, i] = val + else: + fill_count = 0 + val = values[j, i] + +@cython.wraparound(False) +def take_1d_float64(ndarray[float64_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[float64_t] outbuf + float64_t fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + +@cython.wraparound(False) +def take_1d_object(ndarray[object] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[object] outbuf + object fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + +@cython.wraparound(False) +def take_1d_int32(ndarray[int32_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[int32_t] outbuf + int32_t fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + 
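[note] The take_1d_* functions above all come from one generated template: walk an int64 indexer, copy values[idx] into a fresh (or caller-supplied) output buffer, and write fill_value wherever the indexer holds the sentinel -1. The per-dtype "if True/False and _checknan(fill_value)" guard is baked in by the code generator: int and bool outputs cannot represent NaN, so a NaN fill value raises instead of being silently cast. A rough pure-NumPy sketch of the float64 case follows; the helper name is illustrative only and is not part of the patch:

    import numpy as np

    def take_1d_sketch(values, indexer, fill_value=np.nan):
        # Mirror of the generated take_1d_* loop: gather values[idx],
        # writing fill_value wherever the indexer holds the sentinel -1.
        out = np.empty(len(indexer), dtype=values.dtype)
        for i, idx in enumerate(indexer):
            out[i] = fill_value if idx == -1 else values[idx]
        return out

    # take_1d_sketch(np.array([10., 20., 30.]), np.array([2, -1, 0]))
    # -> array([30., nan, 10.])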
+@cython.wraparound(False) +def take_1d_int64(ndarray[int64_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[int64_t] outbuf + int64_t fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + +@cython.wraparound(False) +def take_1d_bool(ndarray[uint8_t] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, n, idx + ndarray[uint8_t] outbuf + uint8_t fv + + n = len(indexer) + + if out is None: + outbuf = np.empty(n, dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i] = values[idx] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + outbuf[i] = fv + else: + outbuf[i] = values[idx] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_float64(ndarray[float64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + float64_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_object(ndarray[object] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + object prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int32(ndarray[int32_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int32_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_int64(ndarray[int64_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + int64_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique +@cython.boundscheck(False) +@cython.wraparound(False) +def is_monotonic_bool(ndarray[uint8_t] arr): + ''' + Returns + ------- + is_monotonic, is_unique + ''' + cdef: + Py_ssize_t i, n + uint8_t prev, cur + bint is_unique = 1 + + n = len(arr) + + if n < 2: + return True, True + + prev = arr[0] + for i in range(1, n): + cur = arr[i] + if cur < prev: + return False, None + elif cur == prev: + is_unique = 0 + prev = cur + return True, is_unique + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
groupby_float64(ndarray[float64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_object(ndarray[object] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int32(ndarray[int32_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_int64(ndarray[int64_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def groupby_bool(ndarray[uint8_t] index, ndarray labels): + cdef dict result = {} + cdef Py_ssize_t i, length + cdef list members + cdef object idx, key + + length = len(index) + + for i in range(length): + key = util.get_value_1d(labels, i) + + if _checknull(key): + continue + + idx = index[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_float64(ndarray[float64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_object(ndarray[object] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_int32(ndarray[int32_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def 
arrmap_int64(ndarray[int64_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def arrmap_bool(ndarray[uint8_t] index, object func): + cdef Py_ssize_t length = index.shape[0] + cdef Py_ssize_t i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + from pandas.lib import maybe_convert_objects + + for i in range(length): + result[i] = func(index[i]) + + return maybe_convert_objects(result) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if 
True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis0_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(indexer) + k = values.shape[1] + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for i in range(n): + idx = indexer[i] + if idx == -1: + for j from 0 <= j < k: + raise ValueError('No NA values allowed') + else: + for j from 0 <= j < k: + outbuf[i, j] = values[idx, j] + else: + fv = fill_value + for i in range(n): + idx = indexer[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + outbuf[i, j] = values[idx, j] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_object(ndarray[object, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if False and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + 
else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_axis1_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] indexer, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(values) + k = len(indexer) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + if True and _checknan(fill_value): + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + raise ValueError('No NA values allowed') + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + else: + fv = fill_value + for j in range(k): + idx = indexer[j] + + if idx == -1: + for i in range(n): + outbuf[i, j] = fv + else: + for i in range(n): + outbuf[i, j] = values[i, idx] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_float64(ndarray[float64_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[float64_t, ndim=2] outbuf + float64_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if False and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_object(ndarray[object, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[object, ndim=2] outbuf + object fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if False and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + 
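[note] The 2-D take variants are generated from the same idea along different axes: take_2d_axis0_* gathers whole rows by an indexer, take_2d_axis1_* gathers whole columns, and take_2d_multi_* applies a row indexer and a column indexer at once (presumably backing reindexing along both axes). A hedged NumPy sketch of the multi-axis case, again with an illustrative name that does not exist in the patch:

    import numpy as np

    def take_2d_multi_sketch(values, idx0, idx1, fill_value=np.nan):
        # Row indexer idx0 and column indexer idx1; a -1 in either one
        # yields fill_value, as in the generated take_2d_multi_* loops.
        out = np.empty((len(idx0), len(idx1)), dtype=values.dtype)
        for i, r in enumerate(idx0):
            for j, c in enumerate(idx1):
                out[i, j] = fill_value if (r == -1 or c == -1) else values[r, c]
        return out

    # take_2d_multi_sketch(np.arange(6.).reshape(2, 3),
    #                      np.array([1, -1]), np.array([0, 2]))
    # -> array([[ 3.,  5.],
    #           [nan, nan]])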
+@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int32(ndarray[int32_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int32_t, ndim=2] outbuf + int32_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_int64(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[int64_t, ndim=2] outbuf + int64_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_2d_multi_bool(ndarray[uint8_t, ndim=2] values, + ndarray[int64_t] idx0, + ndarray[int64_t] idx1, + out=None, fill_value=np.nan): + cdef: + Py_ssize_t i, j, k, n, idx + ndarray[uint8_t, ndim=2] outbuf + uint8_t fv + + n = len(idx0) + k = len(idx1) + + if out is None: + outbuf = np.empty((n, k), dtype=values.dtype) + else: + outbuf = out + + + if True and _checknan(fill_value): + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + raise ValueError('No NA values allowed') + else: + for j in range(k): + if idx1[j] == -1: + raise ValueError('No NA values allowed') + else: + outbuf[i, j] = values[idx, idx1[j]] + else: + fv = fill_value + for i in range(n): + idx = idx0[i] + if idx == -1: + for j in range(k): + outbuf[i, j] = fv + else: + for j in range(k): + if idx1[j] == -1: + outbuf[i, j] = fv + else: + outbuf[i, j] = values[idx, idx1[j]] + + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + float64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + 
i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + object lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int32_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_indexer_unique_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nleft, nright + ndarray[int64_t] indexer + int64_t lval, rval + + i = 0 + j = 0 + nleft = len(left) + nright = len(right) + + indexer = np.empty(nleft, dtype=np.int64) + while True: + if i == nleft: + break + + if j == nright: + indexer[i] = -1 + i += 1 + continue + + rval = right[j] + + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + + if left[i] == right[j]: + indexer[i] = j + i += 1 + while i < nleft - 1 and left[i] == rval: + indexer[i] = j + i += 1 + j += 1 + elif left[i] > rval: + indexer[i] = -1 + j += 1 + else: + indexer[i] = -1 + i += 1 + return indexer + + + +def left_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +def left_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0: + while i < nleft: + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + i += 1 + count += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] 
lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_object(ndarray[object] left, + ndarray[object] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + 
rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def outer_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + cdef: + Py_ssize_t i, j, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft == 0: + count = nright + elif nright == 0: + count = nleft + else: + while True: + if i == nleft: + count += nright - j + break + if j == nright: + count += nleft - i + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and 
right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + count += 1 + i += 1 + else: + count += 1 + j += 1 + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + # do it again, but populate the indexers / result + + i = 0 + j = 0 + count = 0 + if nleft == 0: + for j in range(nright): + lindexer[j] = -1 + rindexer[j] = j + result[j] = right[j] + elif nright == 0: + for i in range(nright): + lindexer[i] = i + rindexer[i] = -1 + result[i] = left[i] + else: + while True: + if i == nleft: + while j < nright: + lindexer[count] = -1 + rindexer[count] = j + result[count] = right[j] + count += 1 + j += 1 + break + if j == nright: + while i < nleft: + lindexer[count] = i + rindexer[count] = -1 + result[count] = left[i] + count += 1 + i += 1 + break + + lval = left[i] + rval = right[j] + + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = lval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + lindexer[count] = i + rindexer[count] = -1 + result[count] = lval + count += 1 + i += 1 + else: + lindexer[count] = -1 + rindexer[count] = j + result[count] = rval + count += 1 + j += 1 + + return result, lindexer, rindexer + + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_float64(ndarray[float64_t] left, + ndarray[float64_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + float64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[float64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.float64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_object(ndarray[object] left, + ndarray[object] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + object lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[object] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=object) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int32(ndarray[int32_t] left, + ndarray[int32_t] right): + ''' + Two-pass algorithm for monotonic indexes. Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int32_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int32_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int32) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + +@cython.wraparound(False) +@cython.boundscheck(False) +def inner_join_indexer_int64(ndarray[int64_t] left, + ndarray[int64_t] right): + ''' + Two-pass algorithm for monotonic indexes. 
Handles many-to-one merges + ''' + cdef: + Py_ssize_t i, j, k, nright, nleft, count + int64_t lval, rval + ndarray[int64_t] lindexer, rindexer + ndarray[int64_t] result + + nleft = len(left) + nright = len(right) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + # do it again now that result size is known + + lindexer = np.empty(count, dtype=np.int64) + rindexer = np.empty(count, dtype=np.int64) + result = np.empty(count, dtype=np.int64) + + i = 0 + j = 0 + count = 0 + if nleft > 0 and nright > 0: + while True: + if i == nleft: + break + if j == nright: + break + + lval = left[i] + rval = right[j] + if lval == rval: + lindexer[count] = i + rindexer[count] = j + result[count] = rval + count += 1 + if i < nleft - 1: + if j < nright - 1 and right[j + 1] == rval: + j += 1 + else: + i += 1 + if left[i] != rval: + j += 1 + elif j < nright - 1: + j += 1 + if lval != right[j]: + i += 1 + else: + # end of the road + break + elif lval < rval: + i += 1 + else: + j += 1 + + return result, lindexer, rindexer + + diff --git a/pandas/src/groupby.pyx b/pandas/src/groupby.pyx new file mode 100644 index 00000000..7bb22393 --- /dev/null +++ b/pandas/src/groupby.pyx @@ -0,0 +1,1489 @@ +#------------------------------------------------------------------------------- +# Groupby-related functions + +@cython.boundscheck(False) +def arrmap(ndarray[object] index, object func): + cdef int length = index.shape[0] + cdef int i = 0 + + cdef ndarray[object] result = np.empty(length, dtype=np.object_) + + for i from 0 <= i < length: + result[i] = func(index[i]) + + return result + +@cython.boundscheck(False) +def groupby_func(object index, object mapper): + cdef dict result = {} + cdef ndarray[object] mapped_index + cdef ndarray[object] index_buf + cdef ndarray[int8_t] mask + cdef int i, length + cdef list members + cdef object idx, key + + length = len(index) + + index_buf = np.asarray(index) + mapped_index = arrmap(index_buf, mapper) + mask = isnullobj(mapped_index) + + for i from 0 <= i < length: + if mask[i]: + continue + + key = mapped_index[i] + idx = index_buf[i] + if key in result: + members = result[key] + members.append(idx) + else: + result[key] = [idx] + + return result + + +def func_groupby_indices(object index, object mapper): + return groupby_indices_naive(arrmap(index, mapper)) + +@cython.boundscheck(False) +cpdef groupby_indices_naive(ndarray[object] values): + cdef dict result + cdef ndarray[int8_t] mask + cdef Py_ssize_t i, length = len(values) + cdef object key + + result = {} + mask = isnullobj(values) + for i from 0 <= i < length: + if mask[i]: + continue + + key = values[i] + if key in result: + ( result[key]).append(i) + else: + result[key] = [i] + + return result + +@cython.boundscheck(False) +def groupby_indices(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels, counts, arr, seen + int64_t loc + dict ids = {} + object val + int64_t k + + ids, labels, counts = group_labels(values) + seen = np.zeros_like(counts) + + # try not to get in trouble here... 
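+    # (the block below allocates one int64 result array per group, keeps a
+    # raw C pointer to each array's data buffer in a malloc'd vector, and
+    # then fills the buffers in a single pass over the labels, with `seen`
+    # tracking the next write position within each group)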
+ cdef int64_t **vecs = malloc(len(ids) * sizeof(int64_t*)) + result = {} + for i from 0 <= i < len(counts): + arr = np.empty(counts[i], dtype=np.int64) + result[ids[i]] = arr + vecs[i] = arr.data + + for i from 0 <= i < n: + k = labels[i] + + # was NaN + if k == -1: + continue + + loc = seen[k] + vecs[k][loc] = i + seen[k] = loc + 1 + + free(vecs) + + return result + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_lexsorted(list list_of_arrays): + cdef: + int i + Py_ssize_t n, nlevels + int64_t k, cur, pre + ndarray arr + + nlevels = len(list_of_arrays) + n = len(list_of_arrays[0]) + + cdef int64_t **vecs = malloc(nlevels * sizeof(int64_t*)) + for i from 0 <= i < nlevels: + # vecs[i] = ( list_of_arrays[i]).data + + arr = list_of_arrays[i] + vecs[i] = arr.data + # assume uniqueness?? + + for i from 1 <= i < n: + for k from 0 <= k < nlevels: + cur = vecs[k][i] + pre = vecs[k][i-1] + if cur == pre: + continue + elif cur > pre: + break + else: + return False + free(vecs) + return True + +@cython.wraparound(False) +@cython.boundscheck(False) +def group_labels(ndarray[object] values): + ''' + Compute label vector from input values and associated useful data + + Returns + ------- + ''' + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + ndarray[int64_t] counts = np.empty(n, dtype=np.int64) + dict ids = {}, reverse = {} + int64_t idx + object val + int64_t count = 0 + + for i from 0 <= i < n: + val = values[i] + + # is NaN + if val != val: + labels[i] = -1 + continue + + # for large number of groups, not doing try: except: makes a big + # difference + if val in ids: + idx = ids[val] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + ids[val] = count + reverse[count] = val + labels[i] = count + counts[count] = 1 + count += 1 + + return reverse, labels, counts[:count].copy() + + +@cython.wraparound(False) +@cython.boundscheck(False) +def get_unique_labels(ndarray[object] values, dict idMap): + cdef int i, length + cdef object idx + cdef ndarray[int64_t] fillVec + length = len(values) + fillVec = np.empty(length, dtype=np.int64) + for i from 0 <= i < length: + idx = values[i] + fillVec[i] = idMap[idx] + + return fillVec + +@cython.boundscheck(False) +@cython.wraparound(False) +def groupsort_indexer(ndarray[int64_t] index, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, loc, label, n + ndarray[int64_t] counts, where, result + + # count group sizes, location 0 for NA + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + for i from 0 <= i < n: + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + where = np.zeros(ngroups + 1, dtype=np.int64) + for i from 1 <= i < ngroups + 1: + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + result = np.zeros(n, dtype=np.int64) + for i from 0 <= i < n: + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts + +# TODO: aggregate multiple columns in single pass + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in 
range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + prodx[lab, j] *= val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + prodx[lab, 0] *= val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +#---------------------------------------------------------------------- +# first, nth, last + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_nth_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins, int64_t rank): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if nobs[b, j] == rank: + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] resx + 
ndarray[int64_t, ndim=2] nobs + + nobs = np.zeros(( out).shape, dtype=np.int64) + resx = np.empty_like(out) + + N, K = ( values).shape + + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + resx[lab, j] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_last_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] resx, nobs + + nobs = np.zeros_like(out) + resx = np.empty_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + resx[b, j] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = resx[i, j] + +#---------------------------------------------------------------------- +# group_min, group_max + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_min(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val < minx[lab, j]: + minx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val < minx[lab, 0]: + minx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_max(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + if val > maxx[lab, j]: + maxx[lab, j] = val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[lab, 0] += 1 + if val > maxx[lab, 0]: + maxx[lab, 0] = val + + for i in range(len(counts)): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def 
group_mean(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + else: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + + for i in range(len(counts)): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_var(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] labels): + cdef: + Py_ssize_t i, j, N, K, lab + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + N, K = ( values).shape + + if K > 1: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[lab, j] += 1 + sumx[lab, j] += val + sumxx[lab, j] += val * val + else: + for i in range(N): + + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + val = values[i, 0] + # not nan + if val == val: + nobs[lab, 0] += 1 + sumx[lab, 0] += val + sumxx[lab, 0] += val * val + + + for i in range(len(counts)): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + +# TODO: could do even better if we know something about the data. eg, index has +# 1-min data, binner has 5-min data, then bins are just strides in index. This +# is a general, O(max(len(values), len(binner))) method. 
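+# Roughly equivalent pure-Python sketch of the linear scan implemented by
+# generate_bins_dt64 below (illustrative only; `generate_bins` here is not
+# part of this module):
+#
+#     def generate_bins(values, binner, closed='left'):
+#         right_closed = closed == 'right'
+#         bins, j = [], 0
+#         for i in range(len(binner) - 1):
+#             r_bin = binner[i + 1]
+#             # advance past every value falling in the current bin
+#             while j < len(values) and (values[j] < r_bin or
+#                                        (right_closed and values[j] == r_bin)):
+#                 j += 1
+#             bins.append(j)   # bins hold cumulative end positions
+#         return bins
+#
+#     generate_bins([1, 2, 3, 7, 8], [0, 5, 10])  ->  [3, 5]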
+ +@cython.boundscheck(False) +@cython.wraparound(False) +def generate_bins_dt64(ndarray[int64_t] values, ndarray[int64_t] binner, + object closed='left'): + """ + Int64 (datetime64) version of generic python version in groupby.py + """ + cdef: + Py_ssize_t lenidx, lenbin, i, j, bc, vc + ndarray[int64_t] bins + int64_t l_bin, r_bin + bint right_closed = closed == 'right' + + lenidx = len(values) + lenbin = len(binner) + + if lenidx <= 0 or lenbin <= 0: + raise ValueError("Invalid length for values or for binner") + + # check binner fits data + if values[0] < binner[0]: + raise ValueError("Values falls before first bin") + + if values[lenidx-1] > binner[lenbin-1]: + raise ValueError("Values falls after last bin") + + bins = np.empty(lenbin - 1, dtype=np.int64) + + j = 0 # index into values + bc = 0 # bin count + + # linear scan + for i in range(0, lenbin - 1): + l_bin = binner[i] + r_bin = binner[i+1] + + # count values in current bin, advance to next bin + while j < lenidx and (values[j] < r_bin or + (right_closed and values[j] == r_bin)): + j += 1 + + bins[bc] = j + bc += 1 + + return bins + +# add passing bin edges, instead of labels + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_add_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b, nbins + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_prod_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] prodx, nobs + + nobs = np.zeros_like(out) + prodx = np.ones_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + prodx[b, j] *= val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + prodx[b, 0] *= val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = prodx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_min_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, 
j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] minx, nobs + + nobs = np.zeros_like(out) + + minx = np.empty_like(out) + minx.fill(np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val < minx[b, j]: + minx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val < minx[b, 0]: + minx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = minx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_max_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] maxx, nobs + + nobs = np.zeros_like(out) + maxx = np.empty_like(out) + maxx.fill(-np.inf) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + if val > maxx[b, j]: + maxx[b, j] = val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + if val > maxx[b, 0]: + maxx[b, 0] = val + + for i in range(ngroups): + for j in range(K): + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = maxx[i, j] + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_ohlc(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + ''' + Only aggregates on axis=0 + ''' + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, count + float64_t vopen, vhigh, vlow, vclose, NA + bint got_first = 0 + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + if out.shape[1] != 4: + raise ValueError('Output array must have 4 columns') + + NA = np.nan + + b = 0 + if K > 1: + raise NotImplementedError + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + b += 1 + got_first = 0 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + if not got_first: + got_first = 1 + vopen = val + vlow = val + vhigh = val + else: + if val < vlow: + vlow = val + if val > vhigh: + vhigh = val + vclose = val + + if not got_first: + out[b, 0] = NA + out[b, 1] = NA + out[b, 2] = NA + out[b, 3] = NA + else: + out[b, 0] = vopen + out[b, 1] = vhigh + out[b, 2] = vlow + out[b, 3] = vclose + + +# @cython.boundscheck(False) +# @cython.wraparound(False) +def group_mean_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + cdef: + Py_ssize_t i, 
j, N, K, ngroups, b + float64_t val, count + ndarray[float64_t, ndim=2] sumx, nobs + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + + N, K = ( values).shape + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + + for i in range(ngroups): + for j in range(K): + count = nobs[i, j] + if nobs[i, j] == 0: + out[i, j] = nan + else: + out[i, j] = sumx[i, j] / count + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_var_bin(ndarray[float64_t, ndim=2] out, + ndarray[int64_t] counts, + ndarray[float64_t, ndim=2] values, + ndarray[int64_t] bins): + + cdef: + Py_ssize_t i, j, N, K, ngroups, b + float64_t val, ct + ndarray[float64_t, ndim=2] nobs, sumx, sumxx + + nobs = np.zeros_like(out) + sumx = np.zeros_like(out) + sumxx = np.zeros_like(out) + + if bins[len(bins) - 1] == len(values): + ngroups = len(bins) + else: + ngroups = len(bins) + 1 + + N, K = ( values).shape + + b = 0 + if K > 1: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + + for j in range(K): + val = values[i, j] + + # not nan + if val == val: + nobs[b, j] += 1 + sumx[b, j] += val + sumxx[b, j] += val * val + else: + for i in range(N): + while b < ngroups - 1 and i >= bins[b]: + b += 1 + + counts[b] += 1 + val = values[i, 0] + + # not nan + if val == val: + nobs[b, 0] += 1 + sumx[b, 0] += val + sumxx[b, 0] += val * val + + for i in range(ngroups): + for j in range(K): + ct = nobs[i, j] + if ct < 2: + out[i, j] = nan + else: + out[i, j] = ((ct * sumxx[i, j] - sumx[i, j] * sumx[i, j]) / + (ct * ct - ct)) + + + +@cython.boundscheck(False) +@cython.wraparound(False) +def row_bool_subset(ndarray[float64_t, ndim=2] values, + ndarray[uint8_t, cast=True] mask): + cdef: + Py_ssize_t i, j, n, k, pos = 0 + ndarray[float64_t, ndim=2] out + + n, k = ( values).shape + assert(n == len(mask)) + + out = np.empty((mask.sum(), k), dtype=np.float64) + + for i in range(n): + if mask[i]: + for j in range(k): + out[pos, j] = values[i, j] + pos += 1 + + return out + + + +def group_count(ndarray[int64_t] values, Py_ssize_t size): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] counts + + counts = np.zeros(size, dtype=np.int64) + for i in range(n): + counts[values[i]] += 1 + return counts + +def lookup_values(ndarray[object] values, dict mapping): + cdef: + Py_ssize_t i, n = len(values) + + result = np.empty(n, dtype='O') + for i in range(n): + result[i] = mapping[values[i]] + return maybe_convert_objects(result) + + +def count_level_1d(ndarray[uint8_t, cast=True] mask, + ndarray[int64_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, n + ndarray[int64_t] counts + + counts = np.zeros(max_bin, dtype='i8') + + n = len(mask) + + for i from 0 <= i < n: + if mask[i]: + counts[labels[i]] += 1 + + return counts + + +def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, + ndarray[int64_t] labels, Py_ssize_t max_bin): + cdef: + Py_ssize_t i, j, k, n + ndarray[int64_t, ndim=2] counts + + n, k = ( mask).shape + counts = np.zeros((max_bin, k), dtype='i8') + + for i from 0 <= i < n: + for j from 0 <= j < k: + if mask[i, j]: + 
counts[labels[i], j] += 1 + + return counts + +cdef class _PandasNull: + + def __richcmp__(_PandasNull self, object other, int op): + if op == 2: # == + return isinstance(other, _PandasNull) + elif op == 3: # != + return not isinstance(other, _PandasNull) + else: + return False + + def __hash__(self): + return 0 + +pandas_null = _PandasNull() + +def fast_zip_fillna(list ndarrays, fill_value=pandas_null): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + if val != val: + val = fill_value + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val != val: + val = fill_value + + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def duplicated(ndarray[object] values, take_last=False): + cdef: + Py_ssize_t i, n + dict seen = {} + object row + + n = len(values) + cdef ndarray[uint8_t] result = np.zeros(n, dtype=np.uint8) + + if take_last: + for i from n > i >= 0: + row = values[i] + + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + else: + for i from 0 <= i < n: + row = values[i] + if row in seen: + result[i] = 1 + else: + seen[row] = None + result[i] = 0 + + return result.view(np.bool_) + +def generate_slices(ndarray[int64_t] labels, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, group_size, n, lab, start + object slobj + ndarray[int64_t] starts + + n = len(labels) + + starts = np.zeros(ngroups, dtype=np.int64) + ends = np.zeros(ngroups, dtype=np.int64) + + start = 0 + group_size = 0 + for i in range(n): + group_size += 1 + lab = labels[i] + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 + + return starts, ends + + +def groupby_arrays(ndarray index, ndarray[int64_t] labels, sort=True): + cdef: + Py_ssize_t i, lab, cur, start, n = len(index) + dict result = {} + + index = np.asarray(index) + + # this is N log N. 
If this is a bottleneck may we worth fixing someday + if sort: + indexer = labels.argsort(kind='mergesort') + + labels = labels.take(indexer) + index = index.take(indexer) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + result[cur] = index[start:i] + start = i + cur = lab + + result[cur] = index[start:] + return result + +def indices_fast(object index, ndarray[int64_t] labels, list keys, + list sorted_labels): + cdef: + Py_ssize_t i, j, k, lab, cur, start, n = len(labels) + dict result = {} + object tup + + k = len(keys) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][i-1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + + result[tup] = index[start:i] + start = i + cur = lab + + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][n - 1]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + result[tup] = index[start:] + + return result diff --git a/pandas/src/hashtable.pxd b/pandas/src/hashtable.pxd new file mode 100644 index 00000000..951e9383 --- /dev/null +++ b/pandas/src/hashtable.pxd @@ -0,0 +1,51 @@ +from khash cimport * + +# prototypes for sharing + +# cdef class StringHashTable: +# cdef kh_str_t *table + +# cdef inline int check_type(self, object) +# cpdef get_item(self, object) +# cpdef set_item(self, object, Py_ssize_t) + +# cdef class Int32HashTable: +# cdef kh_int32_t *table + +# cdef inline int check_type(self, object) +# cpdef get_item(self, int32_t) +# cpdef set_item(self, int32_t, Py_ssize_t) + +# cdef class Int64HashTable: +# cdef kh_int64_t *table + +# cdef inline bint has_key(self, int64_t) +# cpdef get_item(self, int64_t) +# cpdef set_item(self, int64_t, Py_ssize_t) + + +# cdef class Float64HashTable: +# cdef kh_float64_t *table + +# cpdef get_labels(self, ndarray, list, Py_ssize_t, int32_t) + + +# cdef class PyObjectHashTable: +# cdef kh_pymap_t *table + +# cdef destroy(self) +# cpdef get_item(self, object) +# cpdef set_item(self, object, Py_ssize_t) +# cpdef get_labels(self, ndarray, list, Py_ssize_t, int32_t) + + +# cdef class Factorizer: +# cdef public PyObjectHashTable table +# cdef public uniques +# cdef public Py_ssize_t count + + +# cdef class Int64Factorizer: +# cdef public Int64HashTable table +# cdef public list uniques +# cdef public Py_ssize_t count diff --git a/pandas/src/hashtable.pyx b/pandas/src/hashtable.pyx new file mode 100644 index 00000000..b1b254d0 --- /dev/null +++ b/pandas/src/hashtable.pyx @@ -0,0 +1,1028 @@ +from cpython cimport PyObject + +from khash cimport * +from numpy cimport * + +from util cimport _checknan +cimport util + +import numpy as np + +ONAN = np.nan + + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. 
Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + + +cdef extern from "kvec.h": + + ctypedef struct kv_int64_t: + size_t n, m + int64_t* a + + ctypedef struct kv_object_t: + size_t n, m + PyObject** a + + inline void kv_object_push(kv_object_t *v, PyObject* x) + inline void kv_object_destroy(kv_object_t *v) + inline void kv_int64_push(kv_int64_t *v, int64_t x) + + +cdef class ObjectVector: + + cdef: + kv_object_t vec + + def __array__(self): + """ Here we use the __array__ method, that is called when numpy + tries to get an array from the object.""" + cdef npy_intp shape[1] + shape[0] = self.vec.n + + # Create a 1D array, of length 'size' + return PyArray_SimpleNewFromData(1, shape, np.NPY_OBJECT, self.vec.a) + + cdef inline append(self, object o): + kv_object_push(&self.vec, o) + + def __dealloc__(self): + kv_object_destroy(&self.vec) + + +cdef class Int64Vector: + + cdef: + kv_int64_t vec + + def __array__(self): + """ Here we use the __array__ method, that is called when numpy + tries to get an array from the object.""" + cdef npy_intp shape[1] + shape[0] = self.vec.n + + # Create a 1D array, of length 'size' + return PyArray_SimpleNewFromData(1, shape, np.NPY_INT64, + self.vec.a) + + cdef inline append(self, int64_t x): + kv_int64_push(&self.vec, x) + + def __dealloc__(self): + free(self.vec.a) + + +cdef class HashTable: + pass + + +cdef class StringHashTable(HashTable): + cdef kh_str_t *table + + # def __init__(self, size_hint=1): + # if size_hint is not None: + # kh_resize_str(self.table, size_hint) + + def __cinit__(self, int size_hint=1): + self.table = kh_init_str() + if size_hint is not None: + kh_resize_str(self.table, size_hint) + + def __dealloc__(self): + kh_destroy_str(self.table) + + cdef inline int check_type(self, object val): + return util.is_string_object(val) + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_str(self.table, util.get_c_string(val)) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_str(self.table, util.get_c_string(key)) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + buf = util.get_c_string(key) + + k = kh_put_str(self.table, buf, &ret) + self.table.keys[k] = key + if kh_exist_str(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def get_indexer(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + char *buf + int64_t *resbuf = labels.data + khiter_t k + kh_str_t *table = self.table + + for i in range(n): + buf = util.get_c_string(values[i]) + k = kh_get_str(table, buf) + if k != table.n_buckets: + resbuf[i] = table.vals[k] + else: + resbuf[i] = -1 + return labels + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + list uniques = [] + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k == self.table.n_buckets: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + if not ret: + 
kh_del_str(self.table, k) + count += 1 + uniques.append(val) + + # return None + return uniques + + def factorize(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + ndarray[int64_t] counts = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + object val + char *buf + khiter_t k + + for i in range(n): + val = values[i] + buf = util.get_c_string(val) + k = kh_get_str(self.table, buf) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + k = kh_put_str(self.table, buf, &ret) + # print 'putting %s, %s' % (val, count) + if not ret: + kh_del_str(self.table, k) + + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + counts[count] = 1 + count += 1 + + # return None + return reverse, labels, counts[:count].copy() + +cdef class Int32HashTable(HashTable): + cdef kh_int32_t *table + + def __init__(self, size_hint=1): + if size_hint is not None: + kh_resize_int32(self.table, size_hint) + + def __cinit__(self): + self.table = kh_init_int32() + + def __dealloc__(self): + kh_destroy_int32(self.table) + + cdef inline int check_type(self, object val): + return util.is_string_object(val) + + cpdef get_item(self, int32_t val): + cdef khiter_t k + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int32_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int32_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int32(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int32(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int32_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_put_int32(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int32_t val + khiter_t k + ndarray[int32_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def factorize(self, ndarray[int32_t] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels = np.empty(n, dtype=np.int64) + ndarray[int64_t] counts = np.empty(n, dtype=np.int64) + dict reverse = {} + Py_ssize_t idx, count = 0 + int ret = 0 + int32_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_get_int32(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + k = kh_put_int32(self.table, val, &ret) + if not ret: + kh_del_int32(self.table, k) + self.table.vals[k] = count + reverse[count] = val + labels[i] = count + counts[count] = 1 + count += 1 + + # return None + return reverse, labels, counts[:count].copy() + +cdef class Int64HashTable(HashTable): + cdef kh_int64_t *table + + def __init__(self, size_hint=1): + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __cinit__(self): + self.table = kh_init_int64() + + def __dealloc__(self): + 
kh_destroy_int64(self.table) + + def __contains__(self, object key): + cdef khiter_t k + k = kh_get_int64(self.table, key) + return k != self.table.n_buckets + + def __len__(self): + return self.table.size + + cdef inline bint has_key(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + return k != self.table.n_buckets + + cpdef get_item(self, int64_t val): + cdef khiter_t k + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, int64_t key, Py_ssize_t iterations): + cdef Py_ssize_t i, val=0 + for i in range(iterations): + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, int64_t key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + + k = kh_put_int64(self.table, key, &ret) + self.table.keys[k] = key + if kh_exist_int64(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map(self, ndarray[int64_t] keys, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t key + khiter_t k + + for i in range(n): + key = keys[i] + k = kh_put_int64(self.table, key, &ret) + self.table.vals[k] = values[i] + + def map_locations(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + + for i in range(n): + val = values[i] + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def lookup_i4(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + int64_t val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def factorize(self, ndarray[object] values): + reverse = {} + labels, counts = self.get_labels(values, reverse, 0) + return reverse, labels, counts + + def get_labels(self, ndarray[int64_t] values, list uniques, + Py_ssize_t count_prior, Py_ssize_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + ndarray[int64_t] counts + Py_ssize_t idx, count = count_prior + int ret = 0 + int64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + counts = np.empty(count_prior + n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + counts[count] = 1 + count += 1 + + return labels, counts[:count].copy() + + def get_labels_groupby(self, ndarray[int64_t] values, list uniques): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + + # specific for groupby + if val < 0: + labels[i] = -1 + continue + + k = kh_get_int64(self.table, val) + if k != 
self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + count += 1 + + return labels + + def unique(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + ndarray result + int64_t val + khiter_t k + Int64Vector uniques = Int64Vector() + + # TODO: kvec + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k == self.table.n_buckets: + k = kh_put_int64(self.table, val, &ret) + uniques.append(val) + count += 1 + + result = np.array(uniques, copy=False) + result.base = uniques + Py_INCREF(uniques) + + return result + +def value_count_int64(ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + kh_int64_t *table + int ret = 0 + list uniques = [] + + table = kh_init_int64() + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + # for (k = kh_begin(h); k != kh_end(h); ++k) + # if (kh_exist(h, k)) kh_value(h, k) = 1; + i = 0 + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return result_keys, result_counts + +cdef class Float64HashTable(HashTable): + cdef kh_float64_t *table + + def __init__(self, size_hint=1): + if size_hint is not None: + kh_resize_float64(self.table, size_hint) + + def __cinit__(self): + self.table = kh_init_float64() + + def __len__(self): + return self.table.size + + def __dealloc__(self): + kh_destroy_float64(self.table) + + def factorize(self, ndarray[float64_t] values): + uniques = [] + labels, counts = self.get_labels(values, uniques, 0, -1) + return uniques, labels, counts + + cpdef get_labels(self, ndarray[float64_t] values, list uniques, + Py_ssize_t count_prior, int64_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + ndarray[int64_t] counts + Py_ssize_t idx, count = count_prior + int ret = 0 + float64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + counts = np.empty(count_prior + n, dtype=np.int64) + + for i in range(n): + val = values[i] + + if val != val: + labels[i] = na_sentinel + continue + + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + k = kh_put_float64(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + counts[count] = 1 + count += 1 + + return labels, counts[:count].copy() + + def map_locations(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + khiter_t k + + for i in range(n): + k = kh_put_float64(self.table, values[i], &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + float64_t val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + k = kh_get_float64(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def unique(self, ndarray[float64_t] values): + cdef: + Py_ssize_t i, n = 
len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + float64_t val + khiter_t k + list uniques = [] + bint seen_na = 0 + + # TODO: kvec + + for i in range(n): + val = values[i] + + if val == val: + k = kh_get_float64(self.table, val) + if k == self.table.n_buckets: + k = kh_put_float64(self.table, val, &ret) + uniques.append(val) + count += 1 + elif not seen_na: + seen_na = 1 + uniques.append(ONAN) + + return uniques + +cdef class PyObjectHashTable(HashTable): + cdef kh_pymap_t *table + + def __init__(self, size_hint=1): + self.table = kh_init_pymap() + kh_resize_pymap(self.table, size_hint) + + def __dealloc__(self): + if self.table is not NULL: + self.destroy() + + def __len__(self): + return self.table.size + + def __contains__(self, object key): + cdef khiter_t k + hash(key) + k = kh_get_pymap(self.table, key) + return k != self.table.n_buckets + + cpdef destroy(self): + kh_destroy_pymap(self.table) + self.table = NULL + + cpdef get_item(self, object val): + cdef khiter_t k + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + return self.table.vals[k] + else: + raise KeyError(val) + + def get_iter_test(self, object key, Py_ssize_t iterations): + cdef Py_ssize_t i, val + for i in range(iterations): + k = kh_get_pymap(self.table, key) + if k != self.table.n_buckets: + val = self.table.vals[k] + + cpdef set_item(self, object key, Py_ssize_t val): + cdef: + khiter_t k + int ret = 0 + char* buf + + hash(key) + k = kh_put_pymap(self.table, key, &ret) + # self.table.keys[k] = key + if kh_exist_pymap(self.table, k): + self.table.vals[k] = val + else: + raise KeyError(key) + + def map_locations(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + + for i in range(n): + val = values[i] + hash(val) + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = i + + def lookup(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + for i in range(n): + val = values[i] + hash(val) + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + locs[i] = self.table.vals[k] + else: + locs[i] = -1 + + return locs + + def lookup2(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + long hval + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + # for i in range(n): + # val = values[i] + # hval = PyObject_Hash(val) + # k = kh_get_pymap(self.table, val) + + return locs + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = 0 + int ret = 0 + object val + ndarray result + khiter_t k + ObjectVector uniques = ObjectVector() + bint seen_na = 0 + + for i in range(n): + val = values[i] + hash(val) + if not _checknan(val): + k = kh_get_pymap(self.table, val) + if k == self.table.n_buckets: + k = kh_put_pymap(self.table, val, &ret) + uniques.append(val) + elif not seen_na: + seen_na = 1 + uniques.append(ONAN) + + result = np.array(uniques, copy=False) + result.base = uniques + Py_INCREF(uniques) + + return result + + cpdef get_labels(self, ndarray[object] values, list uniques, + Py_ssize_t count_prior, int64_t na_sentinel): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + ndarray[int64_t] counts + Py_ssize_t idx, count = count_prior + int ret = 0 + object val + khiter_t k + + labels = np.empty(n, dtype=np.int64) + counts = np.empty(count_prior + n, dtype=np.int64) + + for i in 
range(n): + val = values[i] + hash(val) + + if val != val or val is None: + labels[i] = na_sentinel + continue + + k = kh_get_pymap(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + k = kh_put_pymap(self.table, val, &ret) + self.table.vals[k] = count + uniques.append(val) + labels[i] = count + counts[count] = 1 + count += 1 + + return labels, counts[:count].copy() + + +cdef class Factorizer: + cdef public PyObjectHashTable table + cdef public uniques + cdef public Py_ssize_t count + + def __init__(self, size_hint): + self.table = PyObjectHashTable(size_hint) + self.uniques = [] + self.count = 0 + + def get_count(self): + return self.count + + def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1): + labels, counts = self.table.get_labels(values, self.uniques, + self.count, na_sentinel) + + # sort on + if sort: + if labels.dtype != np.int_: + labels = labels.astype(np.int_) + + sorter = list_to_object_array(self.uniques).argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + labels = reverse_indexer.take(labels) + counts = counts.take(sorter) + + self.count = len(counts) + return labels, counts + + def unique(self, ndarray[object] values): + # just for fun + return self.table.unique(values) + + +cdef class Int64Factorizer: + cdef public Int64HashTable table + cdef public list uniques + cdef public Py_ssize_t count + + def __init__(self, size_hint): + self.table = Int64HashTable(size_hint) + self.uniques = [] + self.count = 0 + + def get_count(self): + return self.count + + def factorize(self, ndarray[int64_t] values, sort=False, + na_sentinel=-1): + labels, counts = self.table.get_labels(values, self.uniques, + self.count, na_sentinel) + + # sort on + if sort: + if labels.dtype != np.int_: + labels = labels.astype(np.int_) + + sorter = list_to_object_array(self.uniques).argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + labels = reverse_indexer.take(labels) + counts = counts.take(sorter) + + self.count = len(counts) + return labels, counts + + +cdef class DictFactorizer: + + cdef public: + dict table + list uniques + Py_ssize_t count + + def __init__(self, table=None, uniques=None): + if table is None: + self.table = {} + else: + self.table = table + + if uniques is None: + self.uniques = [] + self.count = 0 + else: + self.uniques = uniques + self.count = len(uniques) + + def get_count(self): + return self.count + + def get_labels(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[int64_t] labels + ndarray[int64_t] counts + Py_ssize_t idx, count = self.count + int ret = 0 + object val + + labels = np.empty(n, dtype=np.int64) + counts = np.empty(count + n, dtype=np.int64) + + for i in range(n): + val = values[i] + + if val in self.table: + idx = self.table[val] + labels[i] = idx + counts[idx] = counts[idx] + 1 + else: + self.table[val] = count + self.uniques.append(val) + labels[i] = count + counts[count] = 1 + count += 1 + + return labels, counts[:count].copy() + + def factorize(self, ndarray[object] values, sort=False): + labels, counts = self.get_labels(values) + + # sort on + if sort: + if labels.dtype != np.int_: + labels = labels.astype(np.int_) + + sorter = list_to_object_array(self.uniques).argsort() + reverse_indexer = np.empty(len(sorter), dtype=np.int_) + reverse_indexer.put(sorter, np.arange(len(sorter))) + 
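+            # reverse_indexer maps each existing label to that value's
+            # position among the sorted uniques, so the take below relabels
+            # every observation consistently with the sorted order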
+ labels = reverse_indexer.take(labels) + counts = counts.take(sorter) + + self.count = len(counts) + return labels, counts + + def unique(self, ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = self.count + object val + + for i in range(n): + val = values[i] + if val not in self.table: + self.table[val] = count + self.uniques.append(val) + count += 1 + return self.uniques + + + def unique_int64(self, ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + Py_ssize_t idx, count = self.count + int64_t val + + for i in range(n): + val = values[i] + if val not in self.table: + self.table[val] = count + self.uniques.append(val) + count += 1 + return self.uniques + +def lookup2(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + int ret = 0 + object val + khiter_t k + long hval + ndarray[int64_t] locs = np.empty(n, dtype=np.int64) + + # for i in range(n): + # val = values[i] + # hval = PyObject_Hash(val) + # k = kh_get_pymap(self.table, val) + + return locs + diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx new file mode 100644 index 00000000..ee4f6726 --- /dev/null +++ b/pandas/src/inference.pyx @@ -0,0 +1,765 @@ +cimport util + +_TYPE_MAP = { + np.int8: 'integer', + np.int16: 'integer', + np.int32: 'integer', + np.int64: 'integer', + np.uint8: 'integer', + np.uint16: 'integer', + np.uint32: 'integer', + np.uint64: 'integer', + np.float32: 'floating', + np.float64: 'floating', + np.complex128: 'complex', + np.complex128: 'complex', + np.string_: 'string', + np.unicode_: 'unicode', + np.bool_: 'boolean', + np.datetime64 : 'datetime64' +} + +try: + _TYPE_MAP[np.float128] = 'floating' + _TYPE_MAP[np.complex256] = 'complex' + _TYPE_MAP[np.float16] = 'floating' + _TYPE_MAP[np.datetime64] = 'datetime64' +except AttributeError: + pass + +def infer_dtype(object _values): + cdef: + Py_ssize_t i, n + object val + ndarray values + + if isinstance(_values, np.ndarray): + values = _values + else: + if not isinstance(_values, list): + _values = list(_values) + values = list_to_object_array(_values) + + n = len(values) + if n == 0: + return 'empty' + + val_kind = values.dtype.type + if val_kind in _TYPE_MAP: + return _TYPE_MAP[val_kind] + + if values.dtype != np.object_: + values = values.astype('O') + + val = util.get_value_1d(values, 0) + + if util.is_datetime64_object(val): + if is_datetime64_array(values): + return 'datetime64' + elif util.is_integer_object(val): + if is_integer_array(values): + return 'integer' + return 'mixed-integer' + elif is_datetime(val): + if is_datetime_array(values): + return 'datetime' + + elif is_date(val): + if is_date_array(values): + return 'date' + + elif is_time(val): + if is_time_array(values): + return 'time' + + elif util.is_float_object(val): + if is_float_array(values): + return 'floating' + + elif util.is_bool_object(val): + if is_bool_array(values): + return 'boolean' + + elif PyString_Check(val): + if is_string_array(values): + return 'string' + + elif PyUnicode_Check(val): + if is_unicode_array(values): + return 'unicode' + + for i in range(n): + val = util.get_value_1d(values, i) + if util.is_integer_object(val): + return 'mixed-integer' + + return 'mixed' + +def infer_dtype_list(list values): + cdef: + Py_ssize_t i, n = len(values) + pass + + +cdef inline bint is_datetime(object o): + return PyDateTime_Check(o) + +cdef inline bint is_date(object o): + return PyDate_Check(o) + +cdef inline bint is_time(object o): + return PyTime_Check(o) + +def is_bool_array(ndarray values): + 
cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.bool_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_bool_object(objbuf[i]): + return False + return True + else: + return False + +def is_integer(object o): + return util.is_integer_object(o) + +def is_integer_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.integer): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_integer_object(objbuf[i]): + return False + return True + else: + return False + +def is_float_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.floating): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not util.is_float_object(objbuf[i]): + return False + return True + else: + return False + +def is_string_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, (np.string_, np.unicode_)): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyString_Check(objbuf[i]): + return False + return True + else: + return False + +def is_unicode_array(ndarray values): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] objbuf + object obj + + if issubclass(values.dtype.type, np.unicode_): + return True + elif values.dtype == np.object_: + objbuf = values + + if n == 0: + return False + + for i in range(n): + if not PyUnicode_Check(objbuf[i]): + return False + return True + else: + return False + + +def is_datetime_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_datetime(values[i]): + return False + return True + + +def is_datetime64_array(ndarray values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not util.is_datetime64_object(values[i]): + return False + return True + +def is_date_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_date(values[i]): + return False + return True + +def is_time_array(ndarray[object] values): + cdef int i, n = len(values) + if n == 0: + return False + for i in range(n): + if not is_time(values[i]): + return False + return True + +def maybe_convert_numeric(ndarray[object] values, set na_values, + convert_empty=True): + ''' + Type inference function-- convert strings to numeric (potentially) and + convert to proper dtype array + ''' + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[complex128_t] complexes + ndarray[int64_t] ints + bint seen_float = 0 + bint seen_complex = 0 + object val + float64_t fval + + n = len(values) + + floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') + ints = np.empty(n, dtype='i8') + + for i from 0 <= i < n: + val = values[i] + + if util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = 1 + elif val in na_values: + floats[i] = complexes[i] = nan + seen_float = 1 + elif val is None: + floats[i] = complexes[i] = nan + seen_float = 1 + elif len(val) == 0: + if convert_empty: + floats[i] = 
complexes[i] = nan + seen_float = 1 + else: + raise ValueError('Empty string encountered') + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 + else: + fval = util.floatify(val) + floats[i] = fval + if not seen_float: + if '.' in val: + seen_float = 1 + else: + ints[i] = fval + + if seen_complex: + return complexes + elif seen_float: + return floats + else: + return ints + +def maybe_convert_objects(ndarray[object] objects, bint try_float=0, + bint safe=0, bint convert_datetime=0): + ''' + Type inference function-- convert object array to proper dtype + ''' + cdef: + Py_ssize_t i, n + ndarray[float64_t] floats + ndarray[complex128_t] complexes + ndarray[int64_t] ints + ndarray[uint8_t] bools + ndarray[int64_t] idatetimes + bint seen_float = 0 + bint seen_complex = 0 + bint seen_datetime = 0 + bint seen_int = 0 + bint seen_bool = 0 + bint seen_object = 0 + bint seen_null = 0 + object val, onan + float64_t fval, fnan + + n = len(objects) + + floats = np.empty(n, dtype='f8') + complexes = np.empty(n, dtype='c16') + ints = np.empty(n, dtype='i8') + bools = np.empty(n, dtype=np.uint8) + datetimes = np.empty(n, dtype='M8[ns]') + idatetimes = datetimes.view(np.int64) + + onan = np.nan + fnan = np.nan + + for i from 0 <= i < n: + val = objects[i] + + if val is None: + seen_null = 1 + floats[i] = complexes[i] = fnan + elif util.is_bool_object(val): + seen_bool = 1 + bools[i] = val + elif util.is_integer_object(val): + seen_int = 1 + floats[i] = val + complexes[i] = val + if not seen_null: + ints[i] = val + elif util.is_float_object(val): + floats[i] = complexes[i] = val + seen_float = 1 + elif util.is_complex_object(val): + complexes[i] = val + seen_complex = 1 + elif util.is_datetime64_object(val): + if convert_datetime: + idatetimes[i] = convert_to_tsobject(val).value + seen_datetime = 1 + else: + seen_object = 1 + # objects[i] = val.astype('O') + elif PyDateTime_Check(val): + if convert_datetime: + seen_datetime = 1 + idatetimes[i] = convert_to_tsobject(val).value + else: + seen_object = 1 + elif try_float and not util.is_string_object(val): + # this will convert Decimal objects + try: + floats[i] = float(val) + complexes[i] = complex(val) + seen_float = 1 + except Exception: + seen_object = 1 + else: + seen_object = 1 + + if not safe: + if seen_null: + if (seen_float or seen_int) and not seen_object: + if seen_complex: + return complexes + else: + return floats + else: + return objects + else: + if seen_object: + return objects + elif not seen_bool: + if seen_datetime: + if seen_complex or seen_float or seen_int: + return objects + else: + return datetimes + else: + if seen_complex: + return complexes + elif seen_float: + return floats + elif seen_int: + return ints + else: + if not seen_float and not seen_int: + return bools.view(np.bool_) + + return objects + else: + # don't cast int to float, etc. 
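# --- Editor's note: illustrative sketch, not part of the original diff. ---
# maybe_convert_numeric / maybe_convert_objects above fill parallel candidate
# arrays (ints, floats, complexes, bools, datetimes) in a single pass and then
# return the most specific one based on what was actually seen.  A rough,
# hypothetical pure-Python rendering of that decision for the common numeric
# cases (non-"safe" mode; bools are simplified to the object fallback here,
# whereas the real code keeps a separate bools array):
import numpy as np

def pick_numeric_result(objects):
    floats = np.empty(len(objects), dtype='f8')
    ints = np.empty(len(objects), dtype='i8')
    seen_float = seen_null = seen_other = False
    for i, val in enumerate(objects):
        if val is None:
            seen_null = True
            floats[i] = np.nan
        elif isinstance(val, bool):            # checked before int: bool is an int subclass
            seen_other = True
        elif isinstance(val, int):
            floats[i] = ints[i] = val
        elif isinstance(val, float):
            floats[i] = val
            seen_float = True
        else:
            seen_other = True
    if seen_other:
        return np.asarray(objects, dtype=object)   # fall back to object dtype
    if seen_float or seen_null:
        return floats                              # nulls force float64
    return ints

# pick_numeric_result([1, 2, None]) -> float64 array [1., 2., nan]
# pick_numeric_result([1, 2, 3])    -> int64 array [1, 2, 3]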
+ if seen_null: + if (seen_float or seen_int) and not seen_object: + if seen_complex: + return complexes + else: + return floats + else: + return objects + else: + if seen_object: + return objects + elif not seen_bool: + if seen_datetime: + if seen_complex or seen_float or seen_int: + return objects + else: + return datetimes + else: + if seen_int and seen_float: + return objects + elif seen_complex: + return complexes + elif seen_float: + return floats + elif seen_int: + return ints + else: + if not seen_float and not seen_int: + return bools.view(np.bool_) + + return objects + + +def convert_sql_column(x): + return maybe_convert_objects(x, try_float=1) + +def try_parse_dates(ndarray[object] values, parser=None, + dayfirst=False): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import datetime + + n = len(values) + result = np.empty(n, dtype='O') + + if parser is None: + try: + from dateutil.parser import parse + parse_date = lambda x: parse(x, dayfirst=dayfirst) + except ImportError: # pragma: no cover + def parse_date(s): + try: + return datetime.strptime(s, '%m/%d/%Y') + except Exception: + return s + # EAFP here + try: + for i from 0 <= i < n: + result[i] = parse_date(values[i]) + except Exception: + # failed + return values + else: + parse_date = parser + + try: + for i from 0 <= i < n: + result[i] = parse_date(values[i]) + except Exception: + # raise if passed parser and it failed + raise + + return result + +def try_parse_date_and_time(ndarray[object] dates, ndarray[object] times, + date_parser=None, time_parser=None, + dayfirst=False): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import date, time, datetime + + n = len(dates) + if len(times) != n: + raise ValueError('Length of dates and times must be equal') + result = np.empty(n, dtype='O') + + if date_parser is None: + try: + from dateutil.parser import parse + parse_date = lambda x: parse(x, dayfirst=dayfirst) + except ImportError: # pragma: no cover + def parse_date(s): + try: + return date.strptime(s, '%m/%d/%Y') + except Exception: + return s + else: + parse_date = date_parser + + if time_parser is None: + try: + from dateutil.parser import parse + parse_time = lambda x: parse(x) + except ImportError: # pragma: no cover + def parse_time(s): + try: + return time.strptime(s, '%H:%M:%S') + except Exception: + return s + + else: + parse_time = time_parser + + for i from 0 <= i < n: + d = parse_date(dates[i]) + t = parse_time(times[i]) + result[i] = datetime(d.year, d.month, d.day, + t.hour, t.minute, t.second) + + return result + + +def try_parse_year_month_day(ndarray[object] years, ndarray[object] months, + ndarray[object] days): + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import datetime + + n = len(years) + if len(months) != n or len(days) != n: + raise ValueError('Length of years/months/days must all be equal') + result = np.empty(n, dtype='O') + + for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i])) + + return result + +def try_parse_datetime_components(ndarray[object] years, ndarray[object] months, + ndarray[object] days, ndarray[object] hours, ndarray[object] minutes, + ndarray[object] seconds): + + cdef: + Py_ssize_t i, n + ndarray[object] result + + from datetime import datetime + + n = len(years) + if (len(months) != n and len(days) != n and len(hours) != n and + len(minutes) != n and len(seconds) != n): + raise ValueError('Length of all datetime components must be equal') + result = np.empty(n, dtype='O') + + 
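# --- Editor's note: illustrative sketch, not part of the original diff. ---
# try_parse_dates above applies a user-supplied parser when one is given
# (errors then propagate), otherwise it builds a default parser from
# dateutil.parser.parse (falling back to a fixed '%m/%d/%Y' strptime when
# dateutil is unavailable) and works EAFP-style: if the default parser fails
# on any element, the original values are returned unchanged.  A minimal
# pure-Python rendering of that control flow (names are hypothetical):
from datetime import datetime

def parse_dates_sketch(values, parser=None, dayfirst=False):
    if parser is not None:
        # explicit parser: let any exception reach the caller
        return [parser(v) for v in values]
    try:
        from dateutil.parser import parse
        parse_date = lambda x: parse(x, dayfirst=dayfirst)
    except ImportError:
        def parse_date(s):
            try:
                return datetime.strptime(s, '%m/%d/%Y')
            except Exception:
                return s               # mirror the source: keep the raw value
    try:
        return [parse_date(v) for v in values]
    except Exception:
        return list(values)            # default parser failed: hand back the input

# parse_dates_sketch(['06/29/2012', '07/01/2012']) -> [datetime(2012, 6, 29), ...]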
for i from 0 <= i < n: + result[i] = datetime(int(years[i]), int(months[i]), int(days[i]), + int(hours[i]), int(minutes[i]), int(seconds[i])) + + return result + +def sanitize_objects(ndarray[object] values, set na_values, + convert_empty=True): + cdef: + Py_ssize_t i, n + object val, onan + Py_ssize_t na_count = 0 + dict memo = {} + + n = len(values) + onan = np.nan + + for i from 0 <= i < n: + val = values[i] + if (convert_empty and val == '') or (val in na_values): + values[i] = onan + na_count += 1 + elif val in memo: + values[i] = memo[val] + else: + memo[val] = val + + return na_count + +def maybe_convert_bool(ndarray[object] arr): + cdef: + Py_ssize_t i, n + ndarray[uint8_t] result + object val + + n = len(arr) + result = np.empty(n, dtype=np.uint8) + + for i from 0 <= i < n: + val = arr[i] + + if val == 'True' or type(val) == bool and val: + result[i] = 1 + elif val == 'False' or type(val) == bool and not val: + result[i] = 0 + else: + return arr + + return result.view(np.bool_) + + +def map_infer(ndarray arr, object f, bint convert=1): + ''' + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + ''' + cdef: + Py_ssize_t i, n + ndarray[object] result + object val + + n = len(arr) + result = np.empty(n, dtype=object) + for i in range(n): + val = f(util.get_value_at(arr, i)) + + # unbox 0-dim arrays, GH #690 + if is_array(val) and PyArray_NDIM(val) == 0: + # is there a faster way to unbox? + val = val.item() + + result[i] = val + + if convert: + return maybe_convert_objects(result, try_float=0, + convert_datetime=0) + + return result + +def to_object_array(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + list row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + for i from 0 <= i < n: + row = rows[i] + + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + +def tuples_to_object_array(ndarray[object] tuples): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple tup + + n = len(tuples) + k = len(tuples[0]) + result = np.empty((n, k), dtype=object) + for i in range(n): + tup = tuples[i] + for j in range(k): + result[i, j] = tup[j] + + return result + +def to_object_array_tuples(list rows): + cdef: + Py_ssize_t i, j, n, k, tmp + ndarray[object, ndim=2] result + tuple row + + n = len(rows) + + k = 0 + for i from 0 <= i < n: + tmp = len(rows[i]) + if tmp > k: + k = tmp + + result = np.empty((n, k), dtype=object) + + try: + for i in range(n): + row = rows[i] + for j from 0 <= j < len(row): + result[i, j] = row[j] + except Exception: + # upcast any subclasses to tuple + for i in range(n): + row = tuple(rows[i]) + for j from 0 <= j < len(row): + result[i, j] = row[j] + + return result + + +def fast_multiget(dict mapping, ndarray keys, default=np.nan): + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + for i in range(n): + val = util.get_value_1d(keys, i) + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) + diff --git a/pandas/src/join.pyx b/pandas/src/join.pyx new file mode 100644 index 00000000..06d00fe2 --- /dev/null +++ b/pandas/src/join.pyx @@ -0,0 +1,267 @@ +import time + +def inner_join(ndarray[int64_t] left, 
ndarray[int64_t] right, + Py_ssize_t max_groups): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + count += lc * rc + + # group 0 is the NA group + cdef: + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) + +def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups, sort=True): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count + ndarray left_sorter, right_sorter, rev + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + if right_count[i] > 0: + count += left_count[i] * right_count[i] + else: + count += left_count[i] + + # group 0 is the NA group + cdef: + Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t offset + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + left_indexer = _get_result_indexer(left_sorter, left_indexer) + right_indexer = _get_result_indexer(right_sorter, right_indexer) + + if not sort: + if left_sorter.dtype != np.int_: + left_sorter = left_sorter.astype(np.int_) + + rev = np.empty(len(left), dtype=np.int_) + rev.put(left_sorter, np.arange(len(left))) + + right_indexer = right_indexer.take(rev) + left_indexer = left_indexer.take(rev) + + return left_indexer, right_indexer + + + +def full_outer_join(ndarray[int64_t] left, ndarray[int64_t] right, + Py_ssize_t max_groups): + cdef: + Py_ssize_t i, j, k, count = 0 + ndarray[int64_t] left_count, right_count, left_sorter, right_sorter + ndarray[int64_t] left_indexer, right_indexer + int64_t lc, rc + + # NA group in location 0 + + left_sorter, left_count = groupsort_indexer(left, max_groups) + 
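# --- Editor's note: illustrative sketch, not part of the original diff. ---
# inner_join / left_outer_join / full_outer_join above all follow the same
# plan: group-sort both key arrays (a counting sort via groupsort_indexer,
# with -1 reserved for the NA group), size the result in a first pass over
# the per-group counts, then emit the cartesian product of matching positions
# group by group.  A small, hypothetical pure-Python version of the inner
# join on already-encoded integer keys (-1 meaning missing):
import numpy as np

def inner_join_sketch(left, right):
    left_idx, right_idx = [], []
    left_pos, right_pos = {}, {}
    # positions of each key on either side; NA keys (-1) never match
    for i, k in enumerate(left):
        if k != -1:
            left_pos.setdefault(k, []).append(i)
    for j, k in enumerate(right):
        if k != -1:
            right_pos.setdefault(k, []).append(j)
    for k in sorted(left_pos):
        if k in right_pos:
            for i in left_pos[k]:          # cartesian product of the matches,
                for j in right_pos[k]:     # exactly like the lc * rc fill loop
                    left_idx.append(i)
                    right_idx.append(j)
    return (np.asarray(left_idx, dtype=np.int64),
            np.asarray(right_idx, dtype=np.int64))

# With left = [0, 1, 1, 2] and right = [1, 1, 2, 3], key 1 contributes
# 2 x 2 = 4 result rows and key 2 contributes one, matching the count pass.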
right_sorter, right_count = groupsort_indexer(right, max_groups) + + # First pass, determine size of result set, do not use the NA group + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc > 0 and lc > 0: + count += lc * rc + else: + count += lc + rc + + # group 0 is the NA group + cdef: + int64_t left_pos = 0, right_pos = 0 + Py_ssize_t offset, position = 0 + + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] + + left_indexer = np.empty(count, dtype=np.int64) + right_indexer = np.empty(count, dtype=np.int64) + + for i in range(1, max_groups + 1): + lc = left_count[i] + rc = right_count[i] + + if rc == 0: + for j in range(lc): + left_indexer[position + j] = left_pos + j + right_indexer[position + j] = -1 + position += lc + elif lc == 0: + for j in range(rc): + left_indexer[position + j] = -1 + right_indexer[position + j] = right_pos + j + position += rc + else: + for j in range(lc): + offset = position + j * rc + for k in range(rc): + left_indexer[offset + k] = left_pos + j + right_indexer[offset + k] = right_pos + k + position += lc * rc + left_pos += lc + right_pos += rc + + return (_get_result_indexer(left_sorter, left_indexer), + _get_result_indexer(right_sorter, right_indexer)) + + + +def _get_result_indexer(sorter, indexer): + if indexer.dtype != np.int_: + indexer = indexer.astype(np.int_) + res = sorter.take(indexer) + np.putmask(res, indexer == -1, -1) + return res + + + +def ffill_indexer(ndarray[int64_t] indexer): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result + int64_t val, last_obs + + result = np.empty(n, dtype=np.int64) + last_obs = -1 + + for i in range(n): + val = indexer[i] + if val == -1: + result[i] = last_obs + else: + result[i] = val + last_obs = val + + return result + + +def ffill_by_group(ndarray[int64_t] indexer, ndarray[int64_t] group_ids, + int64_t max_group): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] result, last_obs + int64_t gid, val + + result = np.empty(n, dtype=np.int64) + + last_obs = np.empty(max_group, dtype=np.int64) + last_obs.fill(-1) + + for i in range(n): + gid = group_ids[i] + val = indexer[i] + if val == -1: + result[i] = last_obs[gid] + else: + result[i] = val + last_obs[gid] = val + + return result + + +@cython.boundscheck(False) +@cython.wraparound(False) +def join_sorter(ndarray[int64_t] index, Py_ssize_t ngroups): + cdef: + Py_ssize_t i, loc, label, n + ndarray[int64_t] counts, where, result + + # count group sizes, location 0 for NA + counts = np.zeros(ngroups + 1, dtype=np.int64) + n = len(index) + for i from 0 <= i < n: + counts[index[i] + 1] += 1 + + # mark the start of each contiguous group of like-indexed data + where = np.zeros(ngroups + 1, dtype=np.int64) + for i from 1 <= i < ngroups + 1: + where[i] = where[i - 1] + counts[i - 1] + + # this is our indexer + result = np.zeros(n, dtype=np.int64) + for i from 0 <= i < n: + label = index[i] + 1 + result[where[label]] = i + where[label] += 1 + + return result, counts + +def _big_join_sorter(index): + pass diff --git a/pandas/src/khash.h b/pandas/src/khash.h new file mode 100644 index 00000000..ac08587b --- /dev/null +++ b/pandas/src/khash.h @@ -0,0 +1,608 @@ +/* The MIT License + + Copyright (c) 2008, 2009, 2011 by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, 
modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "khash.h" +KHASH_MAP_INIT_INT(32, char) +int main() { + int ret, is_missing; + khiter_t k; + khash_t(32) *h = kh_init(32); + k = kh_put(32, h, 5, &ret); + if (!ret) kh_del(32, h, k); + kh_value(h, k) = 10; + k = kh_get(32, h, 10); + is_missing = (k == kh_end(h)); + k = kh_get(32, h, 5); + kh_del(32, h, k); + for (k = kh_begin(h); k != kh_end(h); ++k) + if (kh_exist(h, k)) kh_value(h, k) = 1; + kh_destroy(32, h); + return 0; +} +*/ + +/* + 2011-09-16 (0.2.6): + + * The capacity is a power of 2. This seems to dramatically improve the + speed for simple keys. Thank Zilong Tan for the suggestion. Reference: + + - http://code.google.com/p/ulib/ + - http://nothings.org/computer/judy/ + + * Allow to optionally use linear probing which usually has better + performance for random input. Double hashing is still the default as it + is more robust to certain non-random input. + + * Added Wang's integer hash function (not used by default). This hash + function is more robust to certain non-random input. + + 2011-02-14 (0.2.5): + + * Allow to declare global functions. + + 2009-09-26 (0.2.4): + + * Improve portability + + 2008-09-19 (0.2.3): + + * Corrected the example + * Improved interfaces + + 2008-09-11 (0.2.2): + + * Improved speed a little in kh_put() + + 2008-09-10 (0.2.1): + + * Added kh_clear() + * Fixed a compiling error + + 2008-09-02 (0.2.0): + + * Changed to token concatenation which increases flexibility. + + 2008-08-31 (0.1.2): + + * Fixed a bug in kh_get(), which has not been tested previously. + + 2008-08-31 (0.1.1): + + * Added destructor +*/ + + +#ifndef __AC_KHASH_H +#define __AC_KHASH_H + +/*! + @header + + Generic hash table library. 
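  (Editor's note, not part of the upstream khash.h header: within this patch the
  macros below are instantiated for PyObject*, int32/int64, float64 and
  const char* keys near the end of this file, declared to Cython in
  pandas/src/khash.pxd, and consumed by the hash table and factorizer code
  earlier in the patch.)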
+ */ + +#define AC_VERSION_KHASH_H "0.2.6" + +#include +#include +#include +#include + + +#if UINT_MAX == 0xffffffffu +typedef unsigned int khint32_t; +#elif ULONG_MAX == 0xffffffffu +typedef unsigned long khint32_t; +#endif + +#if ULONG_MAX == ULLONG_MAX +typedef unsigned long khuint64_t; +typedef signed long khint64_t; +#else +typedef unsigned long long khuint64_t; +typedef signed long long khint64_t; +#endif + +typedef double khfloat64_t; + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +typedef khint32_t khint_t; +typedef khint_t khiter_t; + +#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) +#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) +#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) +#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) +#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) +#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) +#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) + +#ifdef KHASH_LINEAR +#define __ac_inc(k, m) 1 +#else +#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) +#endif + +#define __ac_fsize(m) ((m) < 16? 1 : (m)>>4) + +#ifndef kroundup32 +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif + +static const double __ac_HASH_UPPER = 0.77; + +#define KHASH_DECLARE(name, khkey_t, khval_t) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + extern kh_##name##_t *kh_init_##name(); \ + extern void kh_destroy_##name(kh_##name##_t *h); \ + extern void kh_clear_##name(kh_##name##_t *h); \ + extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ + extern void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ + extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ + extern void kh_del_##name(kh_##name##_t *h, khint_t x); + +#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + typedef struct { \ + khint_t n_buckets, size, n_occupied, upper_bound; \ + khint32_t *flags; \ + khkey_t *keys; \ + khval_t *vals; \ + } kh_##name##_t; \ + SCOPE kh_##name##_t *kh_init_##name(void) { \ + return (kh_##name##_t*)calloc(1, sizeof(kh_##name##_t)); \ + } \ + SCOPE void kh_destroy_##name(kh_##name##_t *h) \ + { \ + if (h) { \ + free(h->keys); free(h->flags); \ + free(h->vals); \ + free(h); \ + } \ + } \ + SCOPE void kh_clear_##name(kh_##name##_t *h) \ + { \ + if (h && h->flags) { \ + memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ + h->size = h->n_occupied = 0; \ + } \ + } \ + SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ + { \ + if (h->n_buckets) { \ + khint_t inc, k, i, last, mask; \ + mask = h->n_buckets - 1; \ + k = __hash_func(key); i = k & mask; \ + inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + i = (i + inc) & mask; \ + if (i == last) return h->n_buckets; \ + } \ + return __ac_iseither(h->flags, i)? 
h->n_buckets : i; \ + } else return 0; \ + } \ + SCOPE void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ + { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ + khint32_t *new_flags = 0; \ + khint_t j = 1; \ + { \ + kroundup32(new_n_buckets); \ + if (new_n_buckets < 4) new_n_buckets = 4; \ + if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ + else { /* hash table size to be changed (shrink or expand); rehash */ \ + new_flags = (khint32_t*)malloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ + if (h->n_buckets < new_n_buckets) { /* expand */ \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } /* otherwise shrink */ \ + } \ + } \ + if (j) { /* rehashing is needed */ \ + for (j = 0; j != h->n_buckets; ++j) { \ + if (__ac_iseither(h->flags, j) == 0) { \ + khkey_t key = h->keys[j]; \ + khval_t val; \ + khint_t new_mask; \ + new_mask = new_n_buckets - 1; \ + if (kh_is_map) val = h->vals[j]; \ + __ac_set_isdel_true(h->flags, j); \ + while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ + khint_t inc, k, i; \ + k = __hash_func(key); \ + i = k & new_mask; \ + inc = __ac_inc(k, new_mask); \ + while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ + __ac_set_isempty_false(new_flags, i); \ + if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the existing element */ \ + { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ + if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ + __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ + } else { /* write the element and jump out of the loop */ \ + h->keys[i] = key; \ + if (kh_is_map) h->vals[i] = val; \ + break; \ + } \ + } \ + } \ + } \ + if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ + h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \ + if (kh_is_map) h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \ + } \ + free(h->flags); /* free the working space */ \ + h->flags = new_flags; \ + h->n_buckets = new_n_buckets; \ + h->n_occupied = h->size; \ + h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ + } \ + } \ + SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ + { \ + khint_t x; \ + if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ + if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); /* clear "deleted" elements */ \ + else kh_resize_##name(h, h->n_buckets + 1); /* expand the hash table */ \ + } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ + { \ + khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ + x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ + if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ + else { \ + inc = __ac_inc(k, mask); last = i; \ + while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ + if (__ac_isdel(h->flags, i)) site = i; \ + i = (i + inc) & mask; \ + if (i == last) { x = site; break; } \ + } \ + if (x == h->n_buckets) { \ + if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \ + else x = i; \ + } \ + } \ + } \ + if 
(__ac_isempty(h->flags, x)) { /* not present at all */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; ++h->n_occupied; \ + *ret = 1; \ + } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ + h->keys[x] = key; \ + __ac_set_isboth_false(h->flags, x); \ + ++h->size; \ + *ret = 2; \ + } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ + return x; \ + } \ + SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ + { \ + if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ + __ac_set_isdel_true(h->flags, x); \ + --h->size; \ + } \ + } + +#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ + KHASH_INIT2(name, static PANDAS_INLINE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) + +/* --- BEGIN OF HASH FUNCTIONS --- */ + +/*! @function + @abstract Integer hash function + @param key The integer [khint32_t] + @return The hash value [khint_t] + */ +#define kh_int_hash_func(key) (khint32_t)(key) +/*! @function + @abstract Integer comparison function + */ +#define kh_int_hash_equal(a, b) ((a) == (b)) +/*! @function + @abstract 64-bit integer hash function + @param key The integer [khint64_t] + @return The hash value [khint_t] + */ +#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) +/*! @function + @abstract 64-bit integer comparison function + */ +#define kh_int64_hash_equal(a, b) ((a) == (b)) + +// kludge + +#define kh_float64_hash_func _Py_HashDouble +#define kh_float64_hash_equal kh_int64_hash_equal + +/*! @function + @abstract const char* hash function + @param s Pointer to a null terminated string + @return The hash value + */ +static PANDAS_INLINE khint_t __ac_X31_hash_string(const char *s) +{ + khint_t h = *s; + if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s; + return h; +} +/*! @function + @abstract Another interface to const char* hash function + @param key Pointer to a null terminated string [const char*] + @return The hash value [khint_t] + */ +#define kh_str_hash_func(key) __ac_X31_hash_string(key) +/*! @function + @abstract Const char* comparison function + */ +#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) + +static PANDAS_INLINE khint_t __ac_Wang_hash(khint_t key) +{ + key += ~(key << 15); + key ^= (key >> 10); + key += (key << 3); + key ^= (key >> 6); + key += ~(key << 11); + key ^= (key >> 16); + return key; +} +#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) + +/* --- END OF HASH FUNCTIONS --- */ + +/* Other convenient macros... */ + +/*! + @abstract Type of the hash table. + @param name Name of the hash table [symbol] + */ +#define khash_t(name) kh_##name##_t + +/*! @function + @abstract Initiate a hash table. + @param name Name of the hash table [symbol] + @return Pointer to the hash table [khash_t(name)*] + */ +#define kh_init(name) kh_init_##name(void) + +/*! @function + @abstract Destroy a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_destroy(name, h) kh_destroy_##name(h) + +/*! @function + @abstract Reset a hash table without deallocating memory. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + */ +#define kh_clear(name, h) kh_clear_##name(h) + +/*! @function + @abstract Resize a hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param s New size [khint_t] + */ +#define kh_resize(name, h, s) kh_resize_##name(h, s) + +/*! 
@function + @abstract Insert a key to the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @param r Extra return code: 0 if the key is present in the hash table; + 1 if the bucket is empty (never used); 2 if the element in + the bucket has been deleted [int*] + @return Iterator to the inserted element [khint_t] + */ +#define kh_put(name, h, k, r) kh_put_##name(h, k, r) + +/*! @function + @abstract Retrieve a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Key [type of keys] + @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] + */ +#define kh_get(name, h, k) kh_get_##name(h, k) + +/*! @function + @abstract Remove a key from the hash table. + @param name Name of the hash table [symbol] + @param h Pointer to the hash table [khash_t(name)*] + @param k Iterator to the element to be deleted [khint_t] + */ +#define kh_del(name, h, k) kh_del_##name(h, k) + +/*! @function + @abstract Test whether a bucket contains data. + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return 1 if containing data; 0 otherwise [int] + */ +#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) + +/*! @function + @abstract Get key given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Key [type of keys] + */ +#define kh_key(h, x) ((h)->keys[x]) + +/*! @function + @abstract Get value given an iterator + @param h Pointer to the hash table [khash_t(name)*] + @param x Iterator to the bucket [khint_t] + @return Value [type of values] + @discussion For hash sets, calling this results in segfault. + */ +#define kh_val(h, x) ((h)->vals[x]) + +/*! @function + @abstract Alias of kh_val() + */ +#define kh_value(h, x) ((h)->vals[x]) + +/*! @function + @abstract Get the start iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The start iterator [khint_t] + */ +#define kh_begin(h) (khint_t)(0) + +/*! @function + @abstract Get the end iterator + @param h Pointer to the hash table [khash_t(name)*] + @return The end iterator [khint_t] + */ +#define kh_end(h) ((h)->n_buckets) + +/*! @function + @abstract Get the number of elements in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of elements in the hash table [khint_t] + */ +#define kh_size(h) ((h)->size) + +/*! @function + @abstract Get the number of buckets in the hash table + @param h Pointer to the hash table [khash_t(name)*] + @return Number of buckets in the hash table [khint_t] + */ +#define kh_n_buckets(h) ((h)->n_buckets) + +/* More conenient interfaces */ + +/*! @function + @abstract Instantiate a hash set containing integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_INT(name) \ + KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_INT(name, khval_t) \ + KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) + +/*! 
@function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_UINT64(name) \ + KHASH_INIT(name, khuint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +#define KHASH_SET_INIT_INT64(name) \ + KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing 64-bit integer keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_UINT64(name, khval_t) \ + KHASH_INIT(name, khuint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +#define KHASH_MAP_INIT_INT64(name, khval_t) \ + KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) + +#define KHASH_MAP_INIT_FLOAT64(name, khval_t) \ + KHASH_INIT(name, khfloat64_t, khval_t, 1, kh_float64_hash_func, kh_float64_hash_equal) + +typedef const char *kh_cstr_t; +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + */ +#define KHASH_SET_INIT_STR(name) \ + KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) + +/*! @function + @abstract Instantiate a hash map containing const char* keys + @param name Name of the hash table [symbol] + @param khval_t Type of values [type] + */ +#define KHASH_MAP_INIT_STR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) + + +#include + +#define kh_python_hash_func(key) (PyObject_Hash(key)) +#define kh_python_hash_equal(a, b) ((a == b) || PyObject_RichCompareBool(a, b, Py_EQ)) + + +// Python object + +typedef PyObject* kh_pyobject_t; + +#define KHASH_MAP_INIT_PYOBJECT(name, khval_t) \ + KHASH_INIT(name, kh_pyobject_t, khval_t, 1, kh_python_hash_func, kh_python_hash_equal) + +KHASH_MAP_INIT_PYOBJECT(pymap, Py_ssize_t) + +#define KHASH_SET_INIT_PYOBJECT(name) \ + KHASH_INIT(name, kh_pyobject_t, char, 0, kh_python_hash_func, kh_python_hash_equal) + +KHASH_SET_INIT_PYOBJECT(pyset) + +#define kh_exist_pymap(h, k) (kh_exist(h, k)) +#define kh_exist_pyset(h, k) (kh_exist(h, k)) +#define kh_exist_str(h, k) (kh_exist(h, k)) +#define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_int64(h, k) (kh_exist(h, k)) +#define kh_exist_int32(h, k) (kh_exist(h, k)) + +KHASH_MAP_INIT_STR(str, Py_ssize_t) + +KHASH_MAP_INIT_INT(int32, Py_ssize_t) +KHASH_MAP_INIT_INT64(int64, Py_ssize_t) +KHASH_MAP_INIT_FLOAT64(float64, Py_ssize_t) + +#endif /* __AC_KHASH_H */ diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd new file mode 100644 index 00000000..ef3f355f --- /dev/null +++ b/pandas/src/khash.pxd @@ -0,0 +1,104 @@ +from cpython cimport PyObject +from numpy cimport int64_t, int32_t, uint32_t, float64_t + +cdef extern from "khash.h": + ctypedef uint32_t khint_t + ctypedef khint_t khiter_t + + ctypedef struct kh_pymap_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + PyObject **keys + Py_ssize_t *vals + + inline kh_pymap_t* kh_init_pymap() + inline void kh_destroy_pymap(kh_pymap_t*) + inline void kh_clear_pymap(kh_pymap_t*) + inline khint_t kh_get_pymap(kh_pymap_t*, PyObject*) + inline void kh_resize_pymap(kh_pymap_t*, khint_t) + inline khint_t kh_put_pymap(kh_pymap_t*, PyObject*, int*) + inline void kh_del_pymap(kh_pymap_t*, khint_t) + + bint kh_exist_pymap(kh_pymap_t*, khiter_t) + + ctypedef struct kh_pyset_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + PyObject **keys + Py_ssize_t 
*vals + + inline kh_pyset_t* kh_init_pyset() + inline void kh_destroy_pyset(kh_pyset_t*) + inline void kh_clear_pyset(kh_pyset_t*) + inline khint_t kh_get_pyset(kh_pyset_t*, PyObject*) + inline void kh_resize_pyset(kh_pyset_t*, khint_t) + inline khint_t kh_put_pyset(kh_pyset_t*, PyObject*, int*) + inline void kh_del_pyset(kh_pyset_t*, khint_t) + + bint kh_exist_pyset(kh_pyset_t*, khiter_t) + + ctypedef char* kh_cstr_t + + ctypedef struct kh_str_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + kh_cstr_t *keys + Py_ssize_t *vals + + inline kh_str_t* kh_init_str() + inline void kh_destroy_str(kh_str_t*) + inline void kh_clear_str(kh_str_t*) + inline khint_t kh_get_str(kh_str_t*, kh_cstr_t) + inline void kh_resize_str(kh_str_t*, khint_t) + inline khint_t kh_put_str(kh_str_t*, kh_cstr_t, int*) + inline void kh_del_str(kh_str_t*, khint_t) + + bint kh_exist_str(kh_str_t*, khiter_t) + + ctypedef struct kh_int64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int64_t *keys + Py_ssize_t *vals + + inline kh_int64_t* kh_init_int64() + inline void kh_destroy_int64(kh_int64_t*) + inline void kh_clear_int64(kh_int64_t*) + inline khint_t kh_get_int64(kh_int64_t*, int64_t) + inline void kh_resize_int64(kh_int64_t*, khint_t) + inline khint_t kh_put_int64(kh_int64_t*, int64_t, int*) + inline void kh_del_int64(kh_int64_t*, khint_t) + + bint kh_exist_int64(kh_int64_t*, khiter_t) + + ctypedef struct kh_float64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + float64_t *keys + Py_ssize_t *vals + + inline kh_float64_t* kh_init_float64() + inline void kh_destroy_float64(kh_float64_t*) + inline void kh_clear_float64(kh_float64_t*) + inline khint_t kh_get_float64(kh_float64_t*, float64_t) + inline void kh_resize_float64(kh_float64_t*, khint_t) + inline khint_t kh_put_float64(kh_float64_t*, float64_t, int*) + inline void kh_del_float64(kh_float64_t*, khint_t) + + bint kh_exist_float64(kh_float64_t*, khiter_t) + + ctypedef struct kh_int32_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + int32_t *keys + Py_ssize_t *vals + + inline kh_int32_t* kh_init_int32() + inline void kh_destroy_int32(kh_int32_t*) + inline void kh_clear_int32(kh_int32_t*) + inline khint_t kh_get_int32(kh_int32_t*, int32_t) + inline void kh_resize_int32(kh_int32_t*, khint_t) + inline khint_t kh_put_int32(kh_int32_t*, int32_t, int*) + inline void kh_del_int32(kh_int32_t*, khint_t) + + bint kh_exist_int32(kh_int32_t*, khiter_t) diff --git a/pandas/src/ktypes.h b/pandas/src/ktypes.h new file mode 100644 index 00000000..981f1737 --- /dev/null +++ b/pandas/src/ktypes.h @@ -0,0 +1,6 @@ +#ifndef __KTYPES_H +#define __KTYPES_H + +/* compipler specific configuration */ + +#endif /* __KTYPES_H */ diff --git a/pandas/src/kvec.h b/pandas/src/kvec.h new file mode 100644 index 00000000..ab9e0b65 --- /dev/null +++ b/pandas/src/kvec.h @@ -0,0 +1,142 @@ +/* The MIT License + + Copyright (c) 2008, by Attractive Chaos + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial 
portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +/* + An example: + +#include "kvec.h" +int main() { + kvec_t(int) array; + kv_init(array); + kv_push(int, array, 10); // append + kv_a(int, array, 20) = 5; // dynamic + kv_A(array, 20) = 4; // static + kv_destroy(array); + return 0; +} +*/ + +/* + 2008-09-22 (0.1.0): + + * The initial version. + +*/ + +#ifndef AC_KVEC_H +#define AC_KVEC_H + +#include +#include +#include + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) + +#define kvec_t(type) struct { size_t n, m; type *a; } +#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) +#define kv_destroy(v) free((v).a) +#define kv_A(v, i) ((v).a[(i)]) +#define kv_pop(v) ((v).a[--(v).n]) +#define kv_size(v) ((v).n) +#define kv_max(v) ((v).m) + +#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) + +#define kv_copy(type, v1, v0) do { \ + if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ + (v1).n = (v0).n; \ + memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ + } while (0) \ + +#define kv_push(type, v, x) do { \ + if ((v)->n == (v)->m) { \ + (v)->m = (v)->m? (v)->m<<1 : 2; \ + (v)->a = (type*)realloc((v)->a, sizeof(type) * (v)->m); \ + } \ + (v)->a[(v)->n++] = (x); \ + } while (0) + +#define kv_pushp(type, v) (((v).n == (v).m)? \ + ((v).m = ((v).m? (v).m<<1 : 2), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : 0), ((v).a + ((v).n++)) + +#define kv_a(type, v, i) ((v).m <= (size_t)(i)? \ + ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ + (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ + : (v).n <= (size_t)(i)? (v).n = (i) \ + : 0), (v).a[(i)] + +// #define kv_int64_push(v, x) (kv_push(int64_t, (v), (x))) + +typedef struct { + size_t n, m; + int64_t* a; +} kv_int64_t; + +typedef struct { + size_t n, m; + PyObject** a; +} kv_object_t; + +void PANDAS_INLINE kv_object_push(kv_object_t *v, PyObject *x) { + do { + if (v->n == v->m) { + v->m = v->m? 
v->m<<1 : 2; + v->a = (PyObject**)realloc(v->a, sizeof(PyObject*) * v->m); + } + v->a[v->n++] = x; + } while (0); + // kv_push(PyObject*, v, x); + Py_INCREF(x); +} + +void PANDAS_INLINE kv_int64_push(kv_int64_t *v, int64_t x) { + kv_push(int64_t, v, x); +} + +void PANDAS_INLINE kv_object_destroy(kv_object_t *v) { + int i; + for (i = 0; i < v->n; ++i) + { + Py_XDECREF(v->a[i]); + } + free(v->a); +} + + +#endif diff --git a/pandas/src/moments.pyx b/pandas/src/moments.pyx new file mode 100644 index 00000000..88633279 --- /dev/null +++ b/pandas/src/moments.pyx @@ -0,0 +1,726 @@ +# Cython implementations of rolling sum, mean, variance, skewness, +# other statistical moment functions +# +# Misc implementation notes +# ------------------------- +# +# - In Cython x * x is faster than x ** 2 for C types, this should be +# periodically revisited to see if it's still true. +# +# - + +# original C implementation by N. Devillard. +# This code in public domain. +# Function : kth_smallest() +# In : array of elements, # of elements in the array, rank k +# Out : one element +# Job : find the kth smallest element in the array + +# Reference: + +# Author: Wirth, Niklaus +# Title: Algorithms + data structures = programs +# Publisher: Englewood Cliffs: Prentice-Hall, 1976 +# Physical description: 366 p. +# Series: Prentice-Hall Series in Automatic Computation + + +def kth_smallest(ndarray[double_t] a, Py_ssize_t k): + cdef: + Py_ssize_t i,j,l,m,n + double_t x, t + + n = len(a) + + l = 0 + m = n-1 + while (l j: break + + if j < k: l = i + if k < i: m = j + return a[k] + + +def median(ndarray arr): + ''' + A faster median + ''' + cdef int n = len(arr) + + if len(arr) == 0: + return np.NaN + + arr = arr.copy() + + if n % 2: + return kth_smallest(arr, n / 2) + else: + return (kth_smallest(arr, n / 2) + + kth_smallest(arr, n / 2 - 1)) / 2 + +# -------------- Min, Max subsequence + +def max_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t i=0,s=0,e=0,T,n + double m, S + + n = len(arr) + + if len(arr) == 0: + return (-1,-1,None) + + m = arr[0] + S = m + T = 0 + + for i in range(1, n): + # S = max { S + A[i], A[i] ) + if (S > 0): + S = S + arr[i] + else: + S = arr[i] + T = i + if S > m: + s = T + e = i + m = S + + return (s, e, m) + +def min_subseq(ndarray[double_t] arr): + cdef: + Py_ssize_t s, e + double m + + (s, e, m) = max_subseq(-arr) + + return (s, e, -m) + +#------------------------------------------------------------------------------- +# Rolling sum + +def roll_sum(ndarray[double_t] input, int win, int minp): + cdef double val, prev, sum_x = 0 + cdef int nobs = 0, i + cdef int N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + + if nobs >= minp: + output[i] = sum_x + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling mean + +def roll_mean(ndarray[double_t] input, + int win, int minp): + cdef double val, prev, sum_x = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: 
+ nobs += 1 + sum_x += val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + + if nobs >= minp: + output[i] = sum_x / nobs + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Exponentially weighted moving average + +def ewma(ndarray[double_t] input, double_t com): + ''' + Compute exponentially-weighted moving average using center-of-mass. + + Parameters + ---------- + input : ndarray (float64 type) + com : float64 + + Returns + ------- + y : ndarray + ''' + + cdef double cur, prev, neww, oldw, adj + cdef Py_ssize_t i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + + neww = 1. / (1. + com) + oldw = 1. - neww + adj = oldw + + output[0] = neww * input[0] + + for i from 1 <= i < N: + cur = input[i] + prev = output[i - 1] + + if cur == cur: + if prev == prev: + output[i] = oldw * prev + neww * cur + else: + output[i] = neww * cur + else: + output[i] = prev + + for i from 0 <= i < N: + cur = input[i] + output[i] = output[i] / (1. - adj) + + if cur == cur: + adj *= oldw + + return output + +#---------------------------------------------------------------------- +# Pairwise correlation/covariance + +@cython.boundscheck(False) +@cython.wraparound(False) +def nancorr(ndarray[float64_t, ndim=2] mat, cov=False): + cdef: + Py_ssize_t i, j, xi, yi, N, K + ndarray[float64_t, ndim=2] result + ndarray[uint8_t, ndim=2] mask + int64_t nobs = 0 + float64_t vx, vy, sumx, sumy, sumxx, sumyy, meanx, meany, divisor + + N, K = ( mat).shape + + result = np.empty((K, K), dtype=np.float64) + mask = np.isfinite(mat).view(np.uint8) + + for xi in range(K): + for yi in range(xi + 1): + nobs = sumxx = sumyy = sumx = sumy = 0 + for i in range(N): + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] + vy = mat[i, yi] + nobs += 1 + sumx += vx + sumy += vy + + if nobs == 0: + result[xi, yi] = result[yi, xi] = np.NaN + else: + meanx = sumx / nobs + meany = sumy / nobs + + # now the cov numerator + sumx = 0 + + for i in range(N): + if mask[i, xi] and mask[i, yi]: + vx = mat[i, xi] - meanx + vy = mat[i, yi] - meany + + sumx += vx * vy + sumxx += vx * vx + sumyy += vy * vy + + divisor = (nobs - 1.0) if cov else sqrt(sumxx * sumyy) + + if divisor != 0: + result[xi, yi] = result[yi, xi] = sumx / divisor + else: + result[xi, yi] = result[yi, xi] = np.NaN + + return result + +#---------------------------------------------------------------------- +# Rolling variance + +def _check_minp(minp, N): + if minp > N: + minp = N + 1 + elif minp == 0: + minp = 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') + return minp + +def roll_var(ndarray[double_t] input, int win, int minp, int ddof=1): + cdef double val, prev, sum_x = 0, sum_xx = 0, nobs = 0 + cdef Py_ssize_t i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + sum_x += val + sum_xx += val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + sum_x -= prev + sum_xx -= prev * prev + nobs -= 1 + + if val == val: + nobs += 1 + sum_x += val + sum_xx += val * val + + if nobs >= minp: + output[i] = (nobs * sum_xx - sum_x * sum_x) / (nobs 
* (nobs - ddof)) + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling skewness + +def roll_skew(ndarray[double_t] input, int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 3 components of the skewness equation + cdef double A, B, C, R + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + + nobs -= 1 + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + + if nobs >= minp: + A = x / nobs + B = xx / nobs - A * A + C = xxx / nobs - A * A * A - 3 * A * B + + R = sqrt(B) + + output[i] = ((sqrt(nobs * (nobs - 1.)) * C) / + ((nobs-2) * R * R * R)) + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling kurtosis + + +def roll_kurt(ndarray[double_t] input, + int win, int minp): + cdef double val, prev + cdef double x = 0, xx = 0, xxx = 0, xxxx = 0 + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + # 5 components of the kurtosis equation + cdef double A, B, C, D, R, K + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + + # seriously don't ask me why this is faster + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + if prev == prev: + x -= prev + xx -= prev * prev + xxx -= prev * prev * prev + xxxx -= prev * prev * prev * prev + + nobs -= 1 + + if val == val: + nobs += 1 + x += val + xx += val * val + xxx += val * val * val + xxxx += val * val * val * val + + if nobs >= minp: + A = x / nobs + R = A * A + B = xx / nobs - R + R = R * A + C = xxx / nobs - R - 3 * A * B + R = R * A + D = xxxx / nobs - R - 6*B*A*A - 4*C*A + + K = (nobs * nobs - 1.)*D/(B*B) - 3*((nobs-1.)**2) + K = K / ((nobs - 2.)*(nobs-3.)) + + output[i] = K + else: + output[i] = NaN + + return output + +#------------------------------------------------------------------------------- +# Rolling median, min, max + +ctypedef double_t (* skiplist_f)(object sl, int n, int p) + +cdef _roll_skiplist_op(ndarray arg, int win, int minp, skiplist_f op): + cdef ndarray[double_t] input = arg + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef Py_ssize_t nobs = 0, i + + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = op(skiplist, nobs, minp) + + return output + +from skiplist cimport * + +def 
roll_median_c(ndarray[float64_t] arg, int win, int minp): + cdef double val, res, prev + cdef: + int ret=0 + skiplist_t *sl + Py_ssize_t midpoint, nobs = 0, i + + + cdef Py_ssize_t N = len(arg) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + sl = skiplist_init(win) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = arg[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist_insert(sl, val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = arg[i] + + if i > win - 1: + prev = arg[i - win] + + if prev == prev: + skiplist_remove(sl, prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist_insert(sl, val) + + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + res = skiplist_get(sl, midpoint, &ret) + else: + res = (skiplist_get(sl, midpoint, &ret) + + skiplist_get(sl, (midpoint - 1), &ret)) / 2 + else: + res = NaN + + output[i] = res + + skiplist_destroy(sl) + + return output + +def roll_median_cython(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_median) + +# Unfortunately had to resort to some hackery here, would like for +# Cython to be able to get this right. + +cdef double_t _get_median(object sl, int nobs, int minp): + cdef Py_ssize_t midpoint + cdef IndexableSkiplist skiplist = sl + if nobs >= minp: + midpoint = nobs / 2 + if nobs % 2: + return skiplist.get(midpoint) + else: + return (skiplist.get(midpoint) + + skiplist.get(midpoint - 1)) / 2 + else: + return NaN + +def roll_max(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_max) + +cdef double_t _get_max(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(nobs - 1) + else: + return NaN + +def roll_min(ndarray input, int win, int minp): + ''' + O(N log(window)) implementation using skip list + ''' + return _roll_skiplist_op(input, win, minp, _get_min) + +cdef double_t _get_min(object skiplist, int nobs, int minp): + if nobs >= minp: + return skiplist.get(0) + else: + return NaN + +def roll_quantile(ndarray[float64_t, cast=True] input, int win, + int minp, double quantile): + ''' + O(N log(window)) implementation using skip list + ''' + cdef double val, prev, midpoint + cdef IndexableSkiplist skiplist + cdef Py_ssize_t nobs = 0, i + cdef Py_ssize_t N = len(input) + cdef ndarray[double_t] output = np.empty(N, dtype=float) + + skiplist = IndexableSkiplist(win) + + minp = _check_minp(minp, N) + + for i from 0 <= i < minp - 1: + val = input[i] + + # Not NaN + if val == val: + nobs += 1 + skiplist.insert(val) + + output[i] = NaN + + for i from minp - 1 <= i < N: + val = input[i] + + if i > win - 1: + prev = input[i - win] + + if prev == prev: + skiplist.remove(prev) + nobs -= 1 + + if val == val: + nobs += 1 + skiplist.insert(val) + + if nobs >= minp: + idx = int((quantile / 1.) 
* (nobs - 1)) + output[i] = skiplist.get(idx) + else: + output[i] = NaN + + return output + +def roll_generic(ndarray[float64_t, cast=True] input, int win, + int minp, object func): + cdef ndarray[double_t] output, counts, bufarr + cdef Py_ssize_t i, n + cdef float64_t *buf, *oldbuf + + if not input.flags.c_contiguous: + input = input.copy('C') + + buf = input.data + + n = len(input) + if n == 0: + return input + + minp = _check_minp(minp, n) + output = np.empty(n, dtype=float) + counts = roll_sum(np.isfinite(input).astype(float), win, minp) + + bufarr = np.empty(win, dtype=float) + oldbuf = bufarr.data + + n = len(input) + for i from 0 <= i < win: + if counts[i] >= minp: + output[i] = func(input[int_max(i - win + 1, 0) : i + 1]) + else: + output[i] = NaN + + for i from win <= i < n: + buf = buf + 1 + bufarr.data = buf + if counts[i] >= minp: + output[i] = func(bufarr) + else: + output[i] = NaN + + bufarr.data = oldbuf + + return output diff --git a/pandas/src/ms_inttypes.h b/pandas/src/ms_inttypes.h new file mode 100644 index 00000000..1be38033 --- /dev/null +++ b/pandas/src/ms_inttypes.h @@ -0,0 +1,305 @@ +// ISO C9x compliant inttypes.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef _MSC_VER // [ +#error "Use this header only with Microsoft Visual C++ compilers!" 
+#endif // _MSC_VER ] + +#ifndef _MSC_INTTYPES_H_ // [ +#define _MSC_INTTYPES_H_ + +#if _MSC_VER > 1000 +#pragma once +#endif + +#include "ms_stdint.h" + +// 7.8 Format conversion of integer types + +typedef struct { + intmax_t quot; + intmax_t rem; +} imaxdiv_t; + +// 7.8.1 Macros for format specifiers + +#if !defined(__cplusplus) || defined(__STDC_FORMAT_MACROS) // [ See footnote 185 at page 198 + +// The fprintf macros for signed integers are: +#define PRId8 "d" +#define PRIi8 "i" +#define PRIdLEAST8 "d" +#define PRIiLEAST8 "i" +#define PRIdFAST8 "d" +#define PRIiFAST8 "i" + +#define PRId16 "hd" +#define PRIi16 "hi" +#define PRIdLEAST16 "hd" +#define PRIiLEAST16 "hi" +#define PRIdFAST16 "hd" +#define PRIiFAST16 "hi" + +#define PRId32 "I32d" +#define PRIi32 "I32i" +#define PRIdLEAST32 "I32d" +#define PRIiLEAST32 "I32i" +#define PRIdFAST32 "I32d" +#define PRIiFAST32 "I32i" + +#define PRId64 "I64d" +#define PRIi64 "I64i" +#define PRIdLEAST64 "I64d" +#define PRIiLEAST64 "I64i" +#define PRIdFAST64 "I64d" +#define PRIiFAST64 "I64i" + +#define PRIdMAX "I64d" +#define PRIiMAX "I64i" + +#define PRIdPTR "Id" +#define PRIiPTR "Ii" + +// The fprintf macros for unsigned integers are: +#define PRIo8 "o" +#define PRIu8 "u" +#define PRIx8 "x" +#define PRIX8 "X" +#define PRIoLEAST8 "o" +#define PRIuLEAST8 "u" +#define PRIxLEAST8 "x" +#define PRIXLEAST8 "X" +#define PRIoFAST8 "o" +#define PRIuFAST8 "u" +#define PRIxFAST8 "x" +#define PRIXFAST8 "X" + +#define PRIo16 "ho" +#define PRIu16 "hu" +#define PRIx16 "hx" +#define PRIX16 "hX" +#define PRIoLEAST16 "ho" +#define PRIuLEAST16 "hu" +#define PRIxLEAST16 "hx" +#define PRIXLEAST16 "hX" +#define PRIoFAST16 "ho" +#define PRIuFAST16 "hu" +#define PRIxFAST16 "hx" +#define PRIXFAST16 "hX" + +#define PRIo32 "I32o" +#define PRIu32 "I32u" +#define PRIx32 "I32x" +#define PRIX32 "I32X" +#define PRIoLEAST32 "I32o" +#define PRIuLEAST32 "I32u" +#define PRIxLEAST32 "I32x" +#define PRIXLEAST32 "I32X" +#define PRIoFAST32 "I32o" +#define PRIuFAST32 "I32u" +#define PRIxFAST32 "I32x" +#define PRIXFAST32 "I32X" + +#define PRIo64 "I64o" +#define PRIu64 "I64u" +#define PRIx64 "I64x" +#define PRIX64 "I64X" +#define PRIoLEAST64 "I64o" +#define PRIuLEAST64 "I64u" +#define PRIxLEAST64 "I64x" +#define PRIXLEAST64 "I64X" +#define PRIoFAST64 "I64o" +#define PRIuFAST64 "I64u" +#define PRIxFAST64 "I64x" +#define PRIXFAST64 "I64X" + +#define PRIoMAX "I64o" +#define PRIuMAX "I64u" +#define PRIxMAX "I64x" +#define PRIXMAX "I64X" + +#define PRIoPTR "Io" +#define PRIuPTR "Iu" +#define PRIxPTR "Ix" +#define PRIXPTR "IX" + +// The fscanf macros for signed integers are: +#define SCNd8 "d" +#define SCNi8 "i" +#define SCNdLEAST8 "d" +#define SCNiLEAST8 "i" +#define SCNdFAST8 "d" +#define SCNiFAST8 "i" + +#define SCNd16 "hd" +#define SCNi16 "hi" +#define SCNdLEAST16 "hd" +#define SCNiLEAST16 "hi" +#define SCNdFAST16 "hd" +#define SCNiFAST16 "hi" + +#define SCNd32 "ld" +#define SCNi32 "li" +#define SCNdLEAST32 "ld" +#define SCNiLEAST32 "li" +#define SCNdFAST32 "ld" +#define SCNiFAST32 "li" + +#define SCNd64 "I64d" +#define SCNi64 "I64i" +#define SCNdLEAST64 "I64d" +#define SCNiLEAST64 "I64i" +#define SCNdFAST64 "I64d" +#define SCNiFAST64 "I64i" + +#define SCNdMAX "I64d" +#define SCNiMAX "I64i" + +#ifdef _WIN64 // [ +# define SCNdPTR "I64d" +# define SCNiPTR "I64i" +#else // _WIN64 ][ +# define SCNdPTR "ld" +# define SCNiPTR "li" +#endif // _WIN64 ] + +// The fscanf macros for unsigned integers are: +#define SCNo8 "o" +#define SCNu8 "u" +#define SCNx8 "x" +#define SCNX8 "X" +#define SCNoLEAST8 "o" 
+#define SCNuLEAST8 "u" +#define SCNxLEAST8 "x" +#define SCNXLEAST8 "X" +#define SCNoFAST8 "o" +#define SCNuFAST8 "u" +#define SCNxFAST8 "x" +#define SCNXFAST8 "X" + +#define SCNo16 "ho" +#define SCNu16 "hu" +#define SCNx16 "hx" +#define SCNX16 "hX" +#define SCNoLEAST16 "ho" +#define SCNuLEAST16 "hu" +#define SCNxLEAST16 "hx" +#define SCNXLEAST16 "hX" +#define SCNoFAST16 "ho" +#define SCNuFAST16 "hu" +#define SCNxFAST16 "hx" +#define SCNXFAST16 "hX" + +#define SCNo32 "lo" +#define SCNu32 "lu" +#define SCNx32 "lx" +#define SCNX32 "lX" +#define SCNoLEAST32 "lo" +#define SCNuLEAST32 "lu" +#define SCNxLEAST32 "lx" +#define SCNXLEAST32 "lX" +#define SCNoFAST32 "lo" +#define SCNuFAST32 "lu" +#define SCNxFAST32 "lx" +#define SCNXFAST32 "lX" + +#define SCNo64 "I64o" +#define SCNu64 "I64u" +#define SCNx64 "I64x" +#define SCNX64 "I64X" +#define SCNoLEAST64 "I64o" +#define SCNuLEAST64 "I64u" +#define SCNxLEAST64 "I64x" +#define SCNXLEAST64 "I64X" +#define SCNoFAST64 "I64o" +#define SCNuFAST64 "I64u" +#define SCNxFAST64 "I64x" +#define SCNXFAST64 "I64X" + +#define SCNoMAX "I64o" +#define SCNuMAX "I64u" +#define SCNxMAX "I64x" +#define SCNXMAX "I64X" + +#ifdef _WIN64 // [ +# define SCNoPTR "I64o" +# define SCNuPTR "I64u" +# define SCNxPTR "I64x" +# define SCNXPTR "I64X" +#else // _WIN64 ][ +# define SCNoPTR "lo" +# define SCNuPTR "lu" +# define SCNxPTR "lx" +# define SCNXPTR "lX" +#endif // _WIN64 ] + +#endif // __STDC_FORMAT_MACROS ] + +// 7.8.2 Functions for greatest-width integer types + +// 7.8.2.1 The imaxabs function +#define imaxabs _abs64 + +// 7.8.2.2 The imaxdiv function + +// This is modified version of div() function from Microsoft's div.c found +// in %MSVC.NET%\crt\src\div.c +#ifdef STATIC_IMAXDIV // [ +static +#else // STATIC_IMAXDIV ][ +_inline +#endif // STATIC_IMAXDIV ] +imaxdiv_t __cdecl imaxdiv(intmax_t numer, intmax_t denom) +{ + imaxdiv_t result; + + result.quot = numer / denom; + result.rem = numer % denom; + + if (numer < 0 && result.rem > 0) { + // did division wrong; must fix up + ++result.quot; + result.rem -= denom; + } + + return result; +} + +// 7.8.2.3 The strtoimax and strtoumax functions +#define strtoimax _strtoi64 +#define strtoumax _strtoui64 + +// 7.8.2.4 The wcstoimax and wcstoumax functions +#define wcstoimax _wcstoi64 +#define wcstoumax _wcstoui64 + + +#endif // _MSC_INTTYPES_H_ ] diff --git a/pandas/src/ms_stdint.h b/pandas/src/ms_stdint.h new file mode 100644 index 00000000..c66fbb81 --- /dev/null +++ b/pandas/src/ms_stdint.h @@ -0,0 +1,247 @@ +// ISO C9x compliant stdint.h for Microsoft Visual Studio +// Based on ISO/IEC 9899:TC2 Committee draft (May 6, 2005) WG14/N1124 +// +// Copyright (c) 2006-2008 Alexander Chemeris +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. The name of the author may be used to endorse or promote products +// derived from this software without specific prior written permission. 
+//
+// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+// EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef _MSC_VER // [
+#error "Use this header only with Microsoft Visual C++ compilers!"
+#endif // _MSC_VER ]
+
+#ifndef _MSC_STDINT_H_ // [
+#define _MSC_STDINT_H_
+
+#if _MSC_VER > 1000
+#pragma once
+#endif
+
+#include <limits.h>
+
+// For Visual Studio 6 in C++ mode and for many Visual Studio versions when
+// compiling for ARM we should wrap the <wchar.h> include with 'extern "C++" {}'
+// or the compiler gives many errors like this:
+// error C2733: second C linkage of overloaded function 'wmemchr' not allowed
+#ifdef __cplusplus
+extern "C" {
+#endif
+# include <wchar.h>
+#ifdef __cplusplus
+}
+#endif
+
+// Define _W64 macros to mark types changing their size, like intptr_t.
+#ifndef _W64
+# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300
+# define _W64 __w64
+# else
+# define _W64
+# endif
+#endif
+
+
+// 7.18.1 Integer types
+
+// 7.18.1.1 Exact-width integer types
+
+// Visual Studio 6 and Embedded Visual C++ 4 don't
+// realize that, e.g. char has the same size as __int8,
+// so we give up on __intX for them.
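+// (Editorial illustration, not part of the original msinttypes header.) With
+// the exact-width typedefs below in place, ordinary C99-style declarations
+// compile unchanged under MSVC, e.g.:
+//     uint32_t mask = 0xFFFFFFFFu;
+//     int64_t  big  = (int64_t)1 << 40;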
+#if (_MSC_VER < 1300) + typedef signed char int8_t; + typedef signed short int16_t; + typedef signed int int32_t; + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; +#else + typedef signed __int8 int8_t; + typedef signed __int16 int16_t; + typedef signed __int32 int32_t; + typedef unsigned __int8 uint8_t; + typedef unsigned __int16 uint16_t; + typedef unsigned __int32 uint32_t; +#endif +typedef signed __int64 int64_t; +typedef unsigned __int64 uint64_t; + + +// 7.18.1.2 Minimum-width integer types +typedef int8_t int_least8_t; +typedef int16_t int_least16_t; +typedef int32_t int_least32_t; +typedef int64_t int_least64_t; +typedef uint8_t uint_least8_t; +typedef uint16_t uint_least16_t; +typedef uint32_t uint_least32_t; +typedef uint64_t uint_least64_t; + +// 7.18.1.3 Fastest minimum-width integer types +typedef int8_t int_fast8_t; +typedef int16_t int_fast16_t; +typedef int32_t int_fast32_t; +typedef int64_t int_fast64_t; +typedef uint8_t uint_fast8_t; +typedef uint16_t uint_fast16_t; +typedef uint32_t uint_fast32_t; +typedef uint64_t uint_fast64_t; + +// 7.18.1.4 Integer types capable of holding object pointers +#ifdef _WIN64 // [ + typedef signed __int64 intptr_t; + typedef unsigned __int64 uintptr_t; +#else // _WIN64 ][ + typedef _W64 signed int intptr_t; + typedef _W64 unsigned int uintptr_t; +#endif // _WIN64 ] + +// 7.18.1.5 Greatest-width integer types +typedef int64_t intmax_t; +typedef uint64_t uintmax_t; + + +// 7.18.2 Limits of specified-width integer types + +#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 + +// 7.18.2.1 Limits of exact-width integer types +#define INT8_MIN ((int8_t)_I8_MIN) +#define INT8_MAX _I8_MAX +#define INT16_MIN ((int16_t)_I16_MIN) +#define INT16_MAX _I16_MAX +#define INT32_MIN ((int32_t)_I32_MIN) +#define INT32_MAX _I32_MAX +#define INT64_MIN ((int64_t)_I64_MIN) +#define INT64_MAX _I64_MAX +#define UINT8_MAX _UI8_MAX +#define UINT16_MAX _UI16_MAX +#define UINT32_MAX _UI32_MAX +#define UINT64_MAX _UI64_MAX + +// 7.18.2.2 Limits of minimum-width integer types +#define INT_LEAST8_MIN INT8_MIN +#define INT_LEAST8_MAX INT8_MAX +#define INT_LEAST16_MIN INT16_MIN +#define INT_LEAST16_MAX INT16_MAX +#define INT_LEAST32_MIN INT32_MIN +#define INT_LEAST32_MAX INT32_MAX +#define INT_LEAST64_MIN INT64_MIN +#define INT_LEAST64_MAX INT64_MAX +#define UINT_LEAST8_MAX UINT8_MAX +#define UINT_LEAST16_MAX UINT16_MAX +#define UINT_LEAST32_MAX UINT32_MAX +#define UINT_LEAST64_MAX UINT64_MAX + +// 7.18.2.3 Limits of fastest minimum-width integer types +#define INT_FAST8_MIN INT8_MIN +#define INT_FAST8_MAX INT8_MAX +#define INT_FAST16_MIN INT16_MIN +#define INT_FAST16_MAX INT16_MAX +#define INT_FAST32_MIN INT32_MIN +#define INT_FAST32_MAX INT32_MAX +#define INT_FAST64_MIN INT64_MIN +#define INT_FAST64_MAX INT64_MAX +#define UINT_FAST8_MAX UINT8_MAX +#define UINT_FAST16_MAX UINT16_MAX +#define UINT_FAST32_MAX UINT32_MAX +#define UINT_FAST64_MAX UINT64_MAX + +// 7.18.2.4 Limits of integer types capable of holding object pointers +#ifdef _WIN64 // [ +# define INTPTR_MIN INT64_MIN +# define INTPTR_MAX INT64_MAX +# define UINTPTR_MAX UINT64_MAX +#else // _WIN64 ][ +# define INTPTR_MIN INT32_MIN +# define INTPTR_MAX INT32_MAX +# define UINTPTR_MAX UINT32_MAX +#endif // _WIN64 ] + +// 7.18.2.5 Limits of greatest-width integer types +#define INTMAX_MIN INT64_MIN +#define INTMAX_MAX INT64_MAX +#define UINTMAX_MAX UINT64_MAX + +// 7.18.3 Limits of other integer 
types + +#ifdef _WIN64 // [ +# define PTRDIFF_MIN _I64_MIN +# define PTRDIFF_MAX _I64_MAX +#else // _WIN64 ][ +# define PTRDIFF_MIN _I32_MIN +# define PTRDIFF_MAX _I32_MAX +#endif // _WIN64 ] + +#define SIG_ATOMIC_MIN INT_MIN +#define SIG_ATOMIC_MAX INT_MAX + +#ifndef SIZE_MAX // [ +# ifdef _WIN64 // [ +# define SIZE_MAX _UI64_MAX +# else // _WIN64 ][ +# define SIZE_MAX _UI32_MAX +# endif // _WIN64 ] +#endif // SIZE_MAX ] + +// WCHAR_MIN and WCHAR_MAX are also defined in +#ifndef WCHAR_MIN // [ +# define WCHAR_MIN 0 +#endif // WCHAR_MIN ] +#ifndef WCHAR_MAX // [ +# define WCHAR_MAX _UI16_MAX +#endif // WCHAR_MAX ] + +#define WINT_MIN 0 +#define WINT_MAX _UI16_MAX + +#endif // __STDC_LIMIT_MACROS ] + + +// 7.18.4 Limits of other integer types + +#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 + +// 7.18.4.1 Macros for minimum-width integer constants + +#define INT8_C(val) val##i8 +#define INT16_C(val) val##i16 +#define INT32_C(val) val##i32 +#define INT64_C(val) val##i64 + +#define UINT8_C(val) val##ui8 +#define UINT16_C(val) val##ui16 +#define UINT32_C(val) val##ui32 +#define UINT64_C(val) val##ui64 + +// 7.18.4.2 Macros for greatest-width integer constants +#define INTMAX_C INT64_C +#define UINTMAX_C UINT64_C + +#endif // __STDC_CONSTANT_MACROS ] + + +#endif // _MSC_STDINT_H_ ] diff --git a/pandas/src/numpy.pxd b/pandas/src/numpy.pxd new file mode 100644 index 00000000..45c2fc18 --- /dev/null +++ b/pandas/src/numpy.pxd @@ -0,0 +1,980 @@ +# NumPy static imports for Cython +# +# If any of the PyArray_* functions are called, import_array must be +# called first. +# +# This also defines backwards-compatability buffer acquisition +# code for use in Python 2.x (or Python <= 2.5 when NumPy starts +# implementing PEP-3118 directly). +# +# Because of laziness, the format string of the buffer is statically +# allocated. Increase the size if this is not enough, or submit a +# patch to do this properly. 
+# +# Author: Dag Sverre Seljebotn +# + +DEF _buffer_format_string_len = 255 + +cimport cpython.buffer as pybuf +from cpython.ref cimport Py_INCREF, Py_XDECREF +from cpython.object cimport PyObject +cimport libc.stdlib as stdlib +cimport libc.stdio as stdio + +cdef extern from "Python.h": + ctypedef int Py_intptr_t + +cdef extern from "numpy/arrayobject.h": + ctypedef Py_intptr_t npy_intp + ctypedef size_t npy_uintp + + cdef enum NPY_TYPES: + NPY_BOOL + NPY_BYTE + NPY_UBYTE + NPY_SHORT + NPY_USHORT + NPY_INT + NPY_UINT + NPY_LONG + NPY_ULONG + NPY_LONGLONG + NPY_ULONGLONG + NPY_FLOAT + NPY_DOUBLE + NPY_LONGDOUBLE + NPY_CFLOAT + NPY_CDOUBLE + NPY_CLONGDOUBLE + NPY_OBJECT + NPY_STRING + NPY_UNICODE + NPY_VOID + NPY_NTYPES + NPY_NOTYPE + + NPY_INT8 + NPY_INT16 + NPY_INT32 + NPY_INT64 + NPY_INT128 + NPY_INT256 + NPY_UINT8 + NPY_UINT16 + NPY_UINT32 + NPY_UINT64 + NPY_UINT128 + NPY_UINT256 + NPY_FLOAT16 + NPY_FLOAT32 + NPY_FLOAT64 + NPY_FLOAT80 + NPY_FLOAT96 + NPY_FLOAT128 + NPY_FLOAT256 + NPY_COMPLEX32 + NPY_COMPLEX64 + NPY_COMPLEX128 + NPY_COMPLEX160 + NPY_COMPLEX192 + NPY_COMPLEX256 + NPY_COMPLEX512 + + NPY_DATETIME + + NPY_INTP + + ctypedef enum NPY_ORDER: + NPY_ANYORDER + NPY_CORDER + NPY_FORTRANORDER + + ctypedef enum NPY_CLIPMODE: + NPY_CLIP + NPY_WRAP + NPY_RAISE + + ctypedef enum NPY_SCALARKIND: + NPY_NOSCALAR, + NPY_BOOL_SCALAR, + NPY_INTPOS_SCALAR, + NPY_INTNEG_SCALAR, + NPY_FLOAT_SCALAR, + NPY_COMPLEX_SCALAR, + NPY_OBJECT_SCALAR + + ctypedef enum NPY_SORTKIND: + NPY_QUICKSORT + NPY_HEAPSORT + NPY_MERGESORT + + ctypedef enum NPY_SEARCHSIDE: + NPY_SEARCHLEFT + NPY_SEARCHRIGHT + + enum: + NPY_C_CONTIGUOUS + NPY_F_CONTIGUOUS + NPY_CONTIGUOUS + NPY_FORTRAN + NPY_OWNDATA + NPY_FORCECAST + NPY_ENSURECOPY + NPY_ENSUREARRAY + NPY_ELEMENTSTRIDES + NPY_ALIGNED + NPY_NOTSWAPPED + NPY_WRITEABLE + NPY_UPDATEIFCOPY + NPY_ARR_HAS_DESCR + + NPY_BEHAVED + NPY_BEHAVED_NS + NPY_CARRAY + NPY_CARRAY_RO + NPY_FARRAY + NPY_FARRAY_RO + NPY_DEFAULT + + NPY_IN_ARRAY + NPY_OUT_ARRAY + NPY_INOUT_ARRAY + NPY_IN_FARRAY + NPY_OUT_FARRAY + NPY_INOUT_FARRAY + + NPY_UPDATE_ALL + + cdef enum: + NPY_MAXDIMS + + npy_intp NPY_MAX_ELSIZE + + ctypedef void (*PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, void *) + + ctypedef class numpy.dtype [object PyArray_Descr]: + # Use PyDataType_* macros when possible, however there are no macros + # for accessing some of the fields, so some are defined. Please + # ask on cython-dev if you need more. + cdef int type_num + cdef int itemsize "elsize" + cdef char byteorder + cdef object fields + cdef tuple names + + ctypedef extern class numpy.flatiter [object PyArrayIterObject]: + # Use through macros + pass + + ctypedef extern class numpy.broadcast [object PyArrayMultiIterObject]: + # Use through macros + pass + + ctypedef struct PyArrayObject: + # For use in situations where ndarray can't replace PyArrayObject*, + # like PyArrayObject**. + pass + + ctypedef class numpy.ndarray [object PyArrayObject]: + cdef __cythonbufferdefaults__ = {"mode": "strided"} + + cdef: + # Only taking a few of the most commonly used and stable fields. + # One should use PyArray_* macros instead to access the C fields. + char *data + int ndim "nd" + npy_intp *shape "dimensions" + npy_intp *strides + dtype descr + PyObject* base + + # Note: This syntax (function definition in pxd files) is an + # experimental exception made for __getbuffer__ and __releasebuffer__ + # -- the details of this may change. 
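+    # (Editorial note, not part of the original numpy.pxd.) __getbuffer__ below
+    # fills a Py_buffer for buffer/memoryview consumers: it validates the
+    # requested contiguity flags, copies shape and strides into a malloc'd
+    # block when npy_intp and Py_ssize_t differ in size, and builds a
+    # struct-style format string for the array's dtype.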
+ def __getbuffer__(ndarray self, Py_buffer* info, int flags): + # This implementation of getbuffer is geared towards Cython + # requirements, and does not yet fullfill the PEP. + # In particular strided access is always provided regardless + # of flags + + if info == NULL: return + + cdef int copy_shape, i, ndim + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + + ndim = PyArray_NDIM(self) + + if sizeof(npy_intp) != sizeof(Py_ssize_t): + copy_shape = 1 + else: + copy_shape = 0 + + if ((flags & pybuf.PyBUF_C_CONTIGUOUS == pybuf.PyBUF_C_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_C_CONTIGUOUS)): + raise ValueError(u"ndarray is not C contiguous") + + if ((flags & pybuf.PyBUF_F_CONTIGUOUS == pybuf.PyBUF_F_CONTIGUOUS) + and not PyArray_CHKFLAGS(self, NPY_F_CONTIGUOUS)): + raise ValueError(u"ndarray is not Fortran contiguous") + + info.buf = PyArray_DATA(self) + info.ndim = ndim + if copy_shape: + # Allocate new buffer for strides and shape info. + # This is allocated as one block, strides first. + info.strides = stdlib.malloc(sizeof(Py_ssize_t) * ndim * 2) + info.shape = info.strides + ndim + for i in range(ndim): + info.strides[i] = PyArray_STRIDES(self)[i] + info.shape[i] = PyArray_DIMS(self)[i] + else: + info.strides = PyArray_STRIDES(self) + info.shape = PyArray_DIMS(self) + info.suboffsets = NULL + info.itemsize = PyArray_ITEMSIZE(self) + info.readonly = not PyArray_ISWRITEABLE(self) + + cdef int t + cdef char* f = NULL + cdef dtype descr = self.descr + cdef list stack + cdef int offset + + cdef bint hasfields = PyDataType_HASFIELDS(descr) + + if not hasfields and not copy_shape: + # do not call releasebuffer + info.obj = None + else: + # need to call releasebuffer + info.obj = self + + if not hasfields: + t = descr.type_num + if ((descr.byteorder == '>' and little_endian) or + (descr.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + if t == NPY_BYTE: f = "b" + elif t == NPY_UBYTE: f = "B" + elif t == NPY_SHORT: f = "h" + elif t == NPY_USHORT: f = "H" + elif t == NPY_INT: f = "i" + elif t == NPY_UINT: f = "I" + elif t == NPY_LONG: f = "l" + elif t == NPY_ULONG: f = "L" + elif t == NPY_LONGLONG: f = "q" + elif t == NPY_ULONGLONG: f = "Q" + elif t == NPY_FLOAT: f = "f" + elif t == NPY_DOUBLE: f = "d" + elif t == NPY_LONGDOUBLE: f = "g" + elif t == NPY_CFLOAT: f = "Zf" + elif t == NPY_CDOUBLE: f = "Zd" + elif t == NPY_CLONGDOUBLE: f = "Zg" + elif t == NPY_OBJECT: f = "O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + info.format = f + return + else: + info.format = stdlib.malloc(_buffer_format_string_len) + info.format[0] = '^' # Native data types, manual alignment + offset = 0 + f = _util_dtypestring(descr, info.format + 1, + info.format + _buffer_format_string_len, + &offset) + f[0] = 0 # Terminate format string + + def __releasebuffer__(ndarray self, Py_buffer* info): + if PyArray_HASFIELDS(self): + stdlib.free(info.format) + if sizeof(npy_intp) != sizeof(Py_ssize_t): + stdlib.free(info.strides) + # info.shape was stored after info.strides in the same block + + + ctypedef signed char npy_bool + + ctypedef signed char npy_byte + ctypedef signed short npy_short + ctypedef signed int npy_int + ctypedef signed long npy_long + ctypedef signed long long npy_longlong + + ctypedef unsigned char npy_ubyte + ctypedef unsigned short npy_ushort + ctypedef unsigned int npy_uint + ctypedef unsigned long npy_ulong + ctypedef unsigned long long npy_ulonglong + + ctypedef float 
npy_float + ctypedef double npy_double + ctypedef long double npy_longdouble + + ctypedef signed char npy_int8 + ctypedef signed short npy_int16 + ctypedef signed int npy_int32 + ctypedef signed long long npy_int64 + ctypedef signed long long npy_int96 + ctypedef signed long long npy_int128 + + ctypedef unsigned char npy_uint8 + ctypedef unsigned short npy_uint16 + ctypedef unsigned int npy_uint32 + ctypedef unsigned long long npy_uint64 + ctypedef unsigned long long npy_uint96 + ctypedef unsigned long long npy_uint128 + + ctypedef float npy_float32 + ctypedef double npy_float64 + ctypedef long double npy_float80 + ctypedef long double npy_float96 + ctypedef long double npy_float128 + + ctypedef struct npy_cfloat: + double real + double imag + + ctypedef struct npy_cdouble: + double real + double imag + + ctypedef struct npy_clongdouble: + double real + double imag + + ctypedef struct npy_complex64: + double real + double imag + + ctypedef struct npy_complex128: + double real + double imag + + ctypedef struct npy_complex160: + double real + double imag + + ctypedef struct npy_complex192: + double real + double imag + + ctypedef struct npy_complex256: + double real + double imag + + ctypedef struct PyArray_Dims: + npy_intp *ptr + int len + + void import_array() + + # + # Macros from ndarrayobject.h + # + bint PyArray_CHKFLAGS(ndarray m, int flags) + bint PyArray_ISCONTIGUOUS(ndarray m) + bint PyArray_ISWRITEABLE(ndarray m) + bint PyArray_ISALIGNED(ndarray m) + + int PyArray_NDIM(ndarray) + bint PyArray_ISONESEGMENT(ndarray) + bint PyArray_ISFORTRAN(ndarray) + int PyArray_FORTRANIF(ndarray) + + void* PyArray_DATA(ndarray) + char* PyArray_BYTES(ndarray) + npy_intp* PyArray_DIMS(ndarray) + npy_intp* PyArray_STRIDES(ndarray) + npy_intp PyArray_DIM(ndarray, size_t) + npy_intp PyArray_STRIDE(ndarray, size_t) + + # object PyArray_BASE(ndarray) wrong refcount semantics + # dtype PyArray_DESCR(ndarray) wrong refcount semantics + int PyArray_FLAGS(ndarray) + npy_intp PyArray_ITEMSIZE(ndarray) + int PyArray_TYPE(ndarray arr) + + object PyArray_GETITEM(ndarray arr, void *itemptr) + int PyArray_SETITEM(ndarray arr, void *itemptr, object obj) + + bint PyTypeNum_ISBOOL(int) + bint PyTypeNum_ISUNSIGNED(int) + bint PyTypeNum_ISSIGNED(int) + bint PyTypeNum_ISINTEGER(int) + bint PyTypeNum_ISFLOAT(int) + bint PyTypeNum_ISNUMBER(int) + bint PyTypeNum_ISSTRING(int) + bint PyTypeNum_ISCOMPLEX(int) + bint PyTypeNum_ISPYTHON(int) + bint PyTypeNum_ISFLEXIBLE(int) + bint PyTypeNum_ISUSERDEF(int) + bint PyTypeNum_ISEXTENDED(int) + bint PyTypeNum_ISOBJECT(int) + + bint PyDataType_ISBOOL(dtype) + bint PyDataType_ISUNSIGNED(dtype) + bint PyDataType_ISSIGNED(dtype) + bint PyDataType_ISINTEGER(dtype) + bint PyDataType_ISFLOAT(dtype) + bint PyDataType_ISNUMBER(dtype) + bint PyDataType_ISSTRING(dtype) + bint PyDataType_ISCOMPLEX(dtype) + bint PyDataType_ISPYTHON(dtype) + bint PyDataType_ISFLEXIBLE(dtype) + bint PyDataType_ISUSERDEF(dtype) + bint PyDataType_ISEXTENDED(dtype) + bint PyDataType_ISOBJECT(dtype) + bint PyDataType_HASFIELDS(dtype) + + bint PyArray_ISBOOL(ndarray) + bint PyArray_ISUNSIGNED(ndarray) + bint PyArray_ISSIGNED(ndarray) + bint PyArray_ISINTEGER(ndarray) + bint PyArray_ISFLOAT(ndarray) + bint PyArray_ISNUMBER(ndarray) + bint PyArray_ISSTRING(ndarray) + bint PyArray_ISCOMPLEX(ndarray) + bint PyArray_ISPYTHON(ndarray) + bint PyArray_ISFLEXIBLE(ndarray) + bint PyArray_ISUSERDEF(ndarray) + bint PyArray_ISEXTENDED(ndarray) + bint PyArray_ISOBJECT(ndarray) + bint PyArray_HASFIELDS(ndarray) + + bint 
PyArray_ISVARIABLE(ndarray) + + bint PyArray_SAFEALIGNEDCOPY(ndarray) + bint PyArray_ISNBO(ndarray) + bint PyArray_IsNativeByteOrder(ndarray) + bint PyArray_ISNOTSWAPPED(ndarray) + bint PyArray_ISBYTESWAPPED(ndarray) + + bint PyArray_FLAGSWAP(ndarray, int) + + bint PyArray_ISCARRAY(ndarray) + bint PyArray_ISCARRAY_RO(ndarray) + bint PyArray_ISFARRAY(ndarray) + bint PyArray_ISFARRAY_RO(ndarray) + bint PyArray_ISBEHAVED(ndarray) + bint PyArray_ISBEHAVED_RO(ndarray) + + + bint PyDataType_ISNOTSWAPPED(dtype) + bint PyDataType_ISBYTESWAPPED(dtype) + + bint PyArray_DescrCheck(object) + + bint PyArray_Check(object) + bint PyArray_CheckExact(object) + + # Cannot be supported due to out arg: + # bint PyArray_HasArrayInterfaceType(object, dtype, object, object&) + # bint PyArray_HasArrayInterface(op, out) + + + bint PyArray_IsZeroDim(object) + # Cannot be supported due to ## ## in macro: + # bint PyArray_IsScalar(object, verbatim work) + bint PyArray_CheckScalar(object) + bint PyArray_IsPythonNumber(object) + bint PyArray_IsPythonScalar(object) + bint PyArray_IsAnyScalar(object) + bint PyArray_CheckAnyScalar(object) + ndarray PyArray_GETCONTIGUOUS(ndarray) + bint PyArray_SAMESHAPE(ndarray, ndarray) + npy_intp PyArray_SIZE(ndarray) + npy_intp PyArray_NBYTES(ndarray) + + object PyArray_FROM_O(object) + object PyArray_FROM_OF(object m, int flags) + bint PyArray_FROM_OT(object m, int type) + bint PyArray_FROM_OTF(object m, int type, int flags) + object PyArray_FROMANY(object m, int type, int min, int max, int flags) + object PyArray_ZEROS(int nd, npy_intp* dims, int type, int fortran) + object PyArray_EMPTY(int nd, npy_intp* dims, int type, int fortran) + void PyArray_FILLWBYTE(object, int val) + npy_intp PyArray_REFCOUNT(object) + object PyArray_ContiguousFromAny(op, int, int min_depth, int max_depth) + unsigned char PyArray_EquivArrTypes(ndarray a1, ndarray a2) + bint PyArray_EquivByteorders(int b1, int b2) + object PyArray_SimpleNew(int nd, npy_intp* dims, int typenum) + object PyArray_SimpleNewFromData(int nd, npy_intp* dims, int typenum, void* data) + #object PyArray_SimpleNewFromDescr(int nd, npy_intp* dims, dtype descr) + object PyArray_ToScalar(void* data, ndarray arr) + + void* PyArray_GETPTR1(ndarray m, npy_intp i) + void* PyArray_GETPTR2(ndarray m, npy_intp i, npy_intp j) + void* PyArray_GETPTR3(ndarray m, npy_intp i, npy_intp j, npy_intp k) + void* PyArray_GETPTR4(ndarray m, npy_intp i, npy_intp j, npy_intp k, npy_intp l) + + void PyArray_XDECREF_ERR(ndarray) + # Cannot be supported due to out arg + # void PyArray_DESCR_REPLACE(descr) + + + object PyArray_Copy(ndarray) + object PyArray_FromObject(object op, int type, int min_depth, int max_depth) + object PyArray_ContiguousFromObject(object op, int type, int min_depth, int max_depth) + object PyArray_CopyFromObject(object op, int type, int min_depth, int max_depth) + + object PyArray_Cast(ndarray mp, int type_num) + object PyArray_Take(ndarray ap, object items, int axis) + object PyArray_Put(ndarray ap, object items, object values) + + void PyArray_ITER_RESET(flatiter it) nogil + void PyArray_ITER_NEXT(flatiter it) nogil + void PyArray_ITER_GOTO(flatiter it, npy_intp* destination) nogil + void PyArray_ITER_GOTO1D(flatiter it, npy_intp ind) nogil + void* PyArray_ITER_DATA(flatiter it) nogil + bint PyArray_ITER_NOTDONE(flatiter it) nogil + + void PyArray_MultiIter_RESET(broadcast multi) nogil + void PyArray_MultiIter_NEXT(broadcast multi) nogil + void PyArray_MultiIter_GOTO(broadcast multi, npy_intp dest) nogil + void 
PyArray_MultiIter_GOTO1D(broadcast multi, npy_intp ind) nogil + void* PyArray_MultiIter_DATA(broadcast multi, npy_intp i) nogil + void PyArray_MultiIter_NEXTi(broadcast multi, npy_intp i) nogil + bint PyArray_MultiIter_NOTDONE(broadcast multi) nogil + + # Functions from __multiarray_api.h + + # Functions taking dtype and returning object/ndarray are disabled + # for now as they steal dtype references. I'm conservative and disable + # more than is probably needed until it can be checked further. + int PyArray_SetNumericOps (object) + object PyArray_GetNumericOps () + int PyArray_INCREF (ndarray) + int PyArray_XDECREF (ndarray) + void PyArray_SetStringFunction (object, int) + dtype PyArray_DescrFromType (int) + object PyArray_TypeObjectFromType (int) + char * PyArray_Zero (ndarray) + char * PyArray_One (ndarray) + #object PyArray_CastToType (ndarray, dtype, int) + int PyArray_CastTo (ndarray, ndarray) + int PyArray_CastAnyTo (ndarray, ndarray) + int PyArray_CanCastSafely (int, int) + npy_bool PyArray_CanCastTo (dtype, dtype) + int PyArray_ObjectType (object, int) + dtype PyArray_DescrFromObject (object, dtype) + #ndarray* PyArray_ConvertToCommonType (object, int *) + dtype PyArray_DescrFromScalar (object) + dtype PyArray_DescrFromTypeObject (object) + npy_intp PyArray_Size (object) + #object PyArray_Scalar (void *, dtype, object) + #object PyArray_FromScalar (object, dtype) + void PyArray_ScalarAsCtype (object, void *) + #int PyArray_CastScalarToCtype (object, void *, dtype) + #int PyArray_CastScalarDirect (object, dtype, void *, int) + object PyArray_ScalarFromObject (object) + #PyArray_VectorUnaryFunc * PyArray_GetCastFunc (dtype, int) + object PyArray_FromDims (int, int *, int) + #object PyArray_FromDimsAndDataAndDescr (int, int *, dtype, char *) + #object PyArray_FromAny (object, dtype, int, int, int, object) + object PyArray_EnsureArray (object) + object PyArray_EnsureAnyArray (object) + #object PyArray_FromFile (stdio.FILE *, dtype, npy_intp, char *) + #object PyArray_FromString (char *, npy_intp, dtype, npy_intp, char *) + #object PyArray_FromBuffer (object, dtype, npy_intp, npy_intp) + #object PyArray_FromIter (object, dtype, npy_intp) + object PyArray_Return (ndarray) + #object PyArray_GetField (ndarray, dtype, int) + #int PyArray_SetField (ndarray, dtype, int, object) + object PyArray_Byteswap (ndarray, npy_bool) + object PyArray_Resize (ndarray, PyArray_Dims *, int, NPY_ORDER) + int PyArray_MoveInto (ndarray, ndarray) + int PyArray_CopyInto (ndarray, ndarray) + int PyArray_CopyAnyInto (ndarray, ndarray) + int PyArray_CopyObject (ndarray, object) + object PyArray_NewCopy (ndarray, NPY_ORDER) + object PyArray_ToList (ndarray) + object PyArray_ToString (ndarray, NPY_ORDER) + int PyArray_ToFile (ndarray, stdio.FILE *, char *, char *) + int PyArray_Dump (object, object, int) + object PyArray_Dumps (object, int) + int PyArray_ValidType (int) + void PyArray_UpdateFlags (ndarray, int) + object PyArray_New (type, int, npy_intp *, int, npy_intp *, void *, int, int, object) + #object PyArray_NewFromDescr (type, dtype, int, npy_intp *, npy_intp *, void *, int, object) + #dtype PyArray_DescrNew (dtype) + dtype PyArray_DescrNewFromType (int) + double PyArray_GetPriority (object, double) + object PyArray_IterNew (object) + object PyArray_MultiIterNew (int, ...) 
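+    # (Editorial note, not part of the original numpy.pxd.) PyArray_MultiIterNew
+    # is variadic; Cython callers normally use the fixed-arity
+    # PyArray_MultiIterNew1 .. PyArray_MultiIterNew5 inline wrappers defined
+    # later in this file, e.g. PyArray_MultiIterNew2(a, b) to broadcast two
+    # (hypothetical) arrays a and b together.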
+ + int PyArray_PyIntAsInt (object) + npy_intp PyArray_PyIntAsIntp (object) + int PyArray_Broadcast (broadcast) + void PyArray_FillObjectArray (ndarray, object) + int PyArray_FillWithScalar (ndarray, object) + npy_bool PyArray_CheckStrides (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *) + dtype PyArray_DescrNewByteorder (dtype, char) + object PyArray_IterAllButAxis (object, int *) + #object PyArray_CheckFromAny (object, dtype, int, int, int, object) + #object PyArray_FromArray (ndarray, dtype, int) + object PyArray_FromInterface (object) + object PyArray_FromStructInterface (object) + #object PyArray_FromArrayAttr (object, dtype, object) + #NPY_SCALARKIND PyArray_ScalarKind (int, ndarray*) + int PyArray_CanCoerceScalar (int, int, NPY_SCALARKIND) + object PyArray_NewFlagsObject (object) + npy_bool PyArray_CanCastScalar (type, type) + #int PyArray_CompareUCS4 (npy_ucs4 *, npy_ucs4 *, register size_t) + int PyArray_RemoveSmallest (broadcast) + int PyArray_ElementStrides (object) + void PyArray_Item_INCREF (char *, dtype) + void PyArray_Item_XDECREF (char *, dtype) + object PyArray_FieldNames (object) + object PyArray_Transpose (ndarray, PyArray_Dims *) + object PyArray_TakeFrom (ndarray, object, int, ndarray, NPY_CLIPMODE) + object PyArray_PutTo (ndarray, object, object, NPY_CLIPMODE) + object PyArray_PutMask (ndarray, object, object) + object PyArray_Repeat (ndarray, object, int) + object PyArray_Choose (ndarray, object, ndarray, NPY_CLIPMODE) + int PyArray_Sort (ndarray, int, NPY_SORTKIND) + object PyArray_ArgSort (ndarray, int, NPY_SORTKIND) + object PyArray_SearchSorted (ndarray, object, NPY_SEARCHSIDE) + object PyArray_ArgMax (ndarray, int, ndarray) + object PyArray_ArgMin (ndarray, int, ndarray) + object PyArray_Reshape (ndarray, object) + object PyArray_Newshape (ndarray, PyArray_Dims *, NPY_ORDER) + object PyArray_Squeeze (ndarray) + #object PyArray_View (ndarray, dtype, type) + object PyArray_SwapAxes (ndarray, int, int) + object PyArray_Max (ndarray, int, ndarray) + object PyArray_Min (ndarray, int, ndarray) + object PyArray_Ptp (ndarray, int, ndarray) + object PyArray_Mean (ndarray, int, int, ndarray) + object PyArray_Trace (ndarray, int, int, int, int, ndarray) + object PyArray_Diagonal (ndarray, int, int, int) + object PyArray_Clip (ndarray, object, object, ndarray) + object PyArray_Conjugate (ndarray, ndarray) + object PyArray_Nonzero (ndarray) + object PyArray_Std (ndarray, int, int, ndarray, int) + object PyArray_Sum (ndarray, int, int, ndarray) + object PyArray_CumSum (ndarray, int, int, ndarray) + object PyArray_Prod (ndarray, int, int, ndarray) + object PyArray_CumProd (ndarray, int, int, ndarray) + object PyArray_All (ndarray, int, ndarray) + object PyArray_Any (ndarray, int, ndarray) + object PyArray_Compress (ndarray, object, int, ndarray) + object PyArray_Flatten (ndarray, NPY_ORDER) + object PyArray_Ravel (ndarray, NPY_ORDER) + npy_intp PyArray_MultiplyList (npy_intp *, int) + int PyArray_MultiplyIntList (int *, int) + void * PyArray_GetPtr (ndarray, npy_intp*) + int PyArray_CompareLists (npy_intp *, npy_intp *, int) + #int PyArray_AsCArray (object*, void *, npy_intp *, int, dtype) + #int PyArray_As1D (object*, char **, int *, int) + #int PyArray_As2D (object*, char ***, int *, int *, int) + int PyArray_Free (object, void *) + #int PyArray_Converter (object, object*) + int PyArray_IntpFromSequence (object, npy_intp *, int) + object PyArray_Concatenate (object, int) + object PyArray_InnerProduct (object, object) + object PyArray_MatrixProduct (object, object) + 
object PyArray_CopyAndTranspose (object) + object PyArray_Correlate (object, object, int) + int PyArray_TypestrConvert (int, int) + #int PyArray_DescrConverter (object, dtype*) + #int PyArray_DescrConverter2 (object, dtype*) + int PyArray_IntpConverter (object, PyArray_Dims *) + #int PyArray_BufferConverter (object, chunk) + int PyArray_AxisConverter (object, int *) + int PyArray_BoolConverter (object, npy_bool *) + int PyArray_ByteorderConverter (object, char *) + int PyArray_OrderConverter (object, NPY_ORDER *) + unsigned char PyArray_EquivTypes (dtype, dtype) + #object PyArray_Zeros (int, npy_intp *, dtype, int) + #object PyArray_Empty (int, npy_intp *, dtype, int) + object PyArray_Where (object, object, object) + object PyArray_Arange (double, double, double, int) + #object PyArray_ArangeObj (object, object, object, dtype) + int PyArray_SortkindConverter (object, NPY_SORTKIND *) + object PyArray_LexSort (object, int) + object PyArray_Round (ndarray, int, ndarray) + unsigned char PyArray_EquivTypenums (int, int) + int PyArray_RegisterDataType (dtype) + int PyArray_RegisterCastFunc (dtype, int, PyArray_VectorUnaryFunc *) + int PyArray_RegisterCanCast (dtype, int, NPY_SCALARKIND) + #void PyArray_InitArrFuncs (PyArray_ArrFuncs *) + object PyArray_IntTupleFromIntp (int, npy_intp *) + int PyArray_TypeNumFromName (char *) + int PyArray_ClipmodeConverter (object, NPY_CLIPMODE *) + #int PyArray_OutputConverter (object, ndarray*) + object PyArray_BroadcastToShape (object, npy_intp *, int) + void _PyArray_SigintHandler (int) + void* _PyArray_GetSigintBuf () + #int PyArray_DescrAlignConverter (object, dtype*) + #int PyArray_DescrAlignConverter2 (object, dtype*) + int PyArray_SearchsideConverter (object, void *) + object PyArray_CheckAxis (ndarray, int *, int) + npy_intp PyArray_OverflowMultiplyList (npy_intp *, int) + int PyArray_CompareString (char *, char *, size_t) + + +# Typedefs that matches the runtime dtype objects in +# the numpy module. + +# The ones that are commented out needs an IFDEF function +# in Cython to enable them only on the right systems. 
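+# (Editorial note, not part of the original numpy.pxd.) These typedefs are what
+# let the Cython modules in this patch declare NumPy buffers by element type,
+# e.g. ndarray[double_t] in the rolling-window routines and
+# ndarray[float64_t, ndim=2] in nancorr above.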
+ +ctypedef npy_int8 int8_t +ctypedef npy_int16 int16_t +ctypedef npy_int32 int32_t +ctypedef npy_int64 int64_t +#ctypedef npy_int96 int96_t +#ctypedef npy_int128 int128_t + +ctypedef npy_uint8 uint8_t +ctypedef npy_uint16 uint16_t +ctypedef npy_uint32 uint32_t +ctypedef npy_uint64 uint64_t +#ctypedef npy_uint96 uint96_t +#ctypedef npy_uint128 uint128_t + +ctypedef npy_float32 float32_t +ctypedef npy_float64 float64_t +#ctypedef npy_float80 float80_t +#ctypedef npy_float128 float128_t + +ctypedef float complex complex64_t +ctypedef double complex complex128_t + +# The int types are mapped a bit surprising -- +# numpy.int corresponds to 'l' and numpy.long to 'q' +ctypedef npy_long int_t +ctypedef npy_longlong long_t +ctypedef npy_longlong longlong_t + +ctypedef npy_ulong uint_t +ctypedef npy_ulonglong ulong_t +ctypedef npy_ulonglong ulonglong_t + +ctypedef npy_intp intp_t +ctypedef npy_uintp uintp_t + +ctypedef npy_double float_t +ctypedef npy_double double_t +ctypedef npy_longdouble longdouble_t + +ctypedef npy_cfloat cfloat_t +ctypedef npy_cdouble cdouble_t +ctypedef npy_clongdouble clongdouble_t + +ctypedef npy_cdouble complex_t + +cdef inline object PyArray_MultiIterNew1(a): + return PyArray_MultiIterNew(1, a) + +cdef inline object PyArray_MultiIterNew2(a, b): + return PyArray_MultiIterNew(2, a, b) + +cdef inline object PyArray_MultiIterNew3(a, b, c): + return PyArray_MultiIterNew(3, a, b, c) + +cdef inline object PyArray_MultiIterNew4(a, b, c, d): + return PyArray_MultiIterNew(4, a, b, c, d) + +cdef inline object PyArray_MultiIterNew5(a, b, c, d, e): + return PyArray_MultiIterNew(5, a, b, c, d, e) + +cdef inline char* _util_dtypestring(dtype descr, char* f, char* end, int* offset) except NULL: + # Recursive utility function used in __getbuffer__ to get format + # string. The new location in the format string is returned. + + cdef dtype child + cdef int delta_offset + cdef tuple i + cdef int endian_detector = 1 + cdef bint little_endian = ((&endian_detector)[0] != 0) + cdef tuple fields + + for childname in descr.names: + fields = descr.fields[childname] + child, new_offset = fields + + if (end - f) - (new_offset - offset[0]) < 15: + raise RuntimeError(u"Format string allocated too short, see comment in numpy.pxd") + + if ((child.byteorder == '>' and little_endian) or + (child.byteorder == '<' and not little_endian)): + raise ValueError(u"Non-native byte order not supported") + # One could encode it in the format string and have Cython + # complain instead, BUT: < and > in format strings also imply + # standardized sizes for datatypes, and we rely on native in + # order to avoid reencoding data types based on their size. + # + # A proper PEP 3118 exporter for other clients than Cython + # must deal properly with this! 
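+        # (Editorial note, not part of the original numpy.pxd.) The loop below
+        # pads the format string with 'x' bytes until this field's offset is
+        # reached; the field's own format code is appended further down,
+        # recursing for nested record dtypes.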
+ + # Output padding bytes + while offset[0] < new_offset: + f[0] = 120 # "x"; pad byte + f += 1 + offset[0] += 1 + + offset[0] += child.itemsize + + if not PyDataType_HASFIELDS(child): + t = child.type_num + if end - f < 5: + raise RuntimeError(u"Format string allocated too short.") + + # Until ticket #99 is fixed, use integers to avoid warnings + if t == NPY_BYTE: f[0] = 98 #"b" + elif t == NPY_UBYTE: f[0] = 66 #"B" + elif t == NPY_SHORT: f[0] = 104 #"h" + elif t == NPY_USHORT: f[0] = 72 #"H" + elif t == NPY_INT: f[0] = 105 #"i" + elif t == NPY_UINT: f[0] = 73 #"I" + elif t == NPY_LONG: f[0] = 108 #"l" + elif t == NPY_ULONG: f[0] = 76 #"L" + elif t == NPY_LONGLONG: f[0] = 113 #"q" + elif t == NPY_ULONGLONG: f[0] = 81 #"Q" + elif t == NPY_FLOAT: f[0] = 102 #"f" + elif t == NPY_DOUBLE: f[0] = 100 #"d" + elif t == NPY_LONGDOUBLE: f[0] = 103 #"g" + elif t == NPY_CFLOAT: f[0] = 90; f[1] = 102; f += 1 # Zf + elif t == NPY_CDOUBLE: f[0] = 90; f[1] = 100; f += 1 # Zd + elif t == NPY_CLONGDOUBLE: f[0] = 90; f[1] = 103; f += 1 # Zg + elif t == NPY_OBJECT: f[0] = 79 #"O" + else: + raise ValueError(u"unknown dtype code in numpy.pxd (%d)" % t) + f += 1 + else: + # Cython ignores struct boundary information ("T{...}"), + # so don't output it + f = _util_dtypestring(child, f, end, offset) + return f + + +# +# ufunc API +# + +cdef extern from "numpy/ufuncobject.h": + + ctypedef void (*PyUFuncGenericFunction) (char **, npy_intp *, npy_intp *, void *) + + ctypedef extern class numpy.ufunc [object PyUFuncObject]: + cdef: + int nin, nout, nargs + int identity + PyUFuncGenericFunction *functions + void **data + int ntypes + int check_return + char *name, *types + char *doc + void *ptr + PyObject *obj + PyObject *userloops + + cdef enum: + PyUFunc_Zero + PyUFunc_One + PyUFunc_None + UFUNC_ERR_IGNORE + UFUNC_ERR_WARN + UFUNC_ERR_RAISE + UFUNC_ERR_CALL + UFUNC_ERR_PRINT + UFUNC_ERR_LOG + UFUNC_MASK_DIVIDEBYZERO + UFUNC_MASK_OVERFLOW + UFUNC_MASK_UNDERFLOW + UFUNC_MASK_INVALID + UFUNC_SHIFT_DIVIDEBYZERO + UFUNC_SHIFT_OVERFLOW + UFUNC_SHIFT_UNDERFLOW + UFUNC_SHIFT_INVALID + UFUNC_FPE_DIVIDEBYZERO + UFUNC_FPE_OVERFLOW + UFUNC_FPE_UNDERFLOW + UFUNC_FPE_INVALID + UFUNC_ERR_DEFAULT + UFUNC_ERR_DEFAULT2 + + object PyUFunc_FromFuncAndData(PyUFuncGenericFunction *, + void **, char *, int, int, int, int, char *, char *, int) + int PyUFunc_RegisterLoopForType(ufunc, int, + PyUFuncGenericFunction, int *, void *) + int PyUFunc_GenericFunction \ + (ufunc, PyObject *, PyObject *, PyArrayObject **) + void PyUFunc_f_f_As_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_d_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_f_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_g_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F_As_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_F_F \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_D_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_G_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f_As_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_ff_f \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_dd_d \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_gg_g \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F_As_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_DD_D \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_FF_F \ + (char **, 
npy_intp *, npy_intp *, void *) + void PyUFunc_GG_G \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_O_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_OO_O_method \ + (char **, npy_intp *, npy_intp *, void *) + void PyUFunc_On_Om \ + (char **, npy_intp *, npy_intp *, void *) + int PyUFunc_GetPyValues \ + (char *, int *, int *, PyObject **) + int PyUFunc_checkfperr \ + (int, PyObject *, int *) + void PyUFunc_clearfperr() + int PyUFunc_getfperr() + int PyUFunc_handlefperr \ + (int, PyObject *, int, int *) + int PyUFunc_ReplaceLoopBySignature \ + (ufunc, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *) + object PyUFunc_FromFuncAndDataAndSignature \ + (PyUFuncGenericFunction *, void **, char *, int, int, int, + int, char *, char *, int, char *) + + void import_ufunc() + + +cdef inline void set_array_base(ndarray arr, object base): + cdef PyObject* baseptr + if base is None: + baseptr = NULL + else: + Py_INCREF(base) # important to do this before decref below! + baseptr = base + Py_XDECREF(arr.base) + arr.base = baseptr + +cdef inline object get_array_base(ndarray arr): + if arr.base is NULL: + return None + else: + return arr.base diff --git a/pandas/src/numpy_helper.h b/pandas/src/numpy_helper.h new file mode 100644 index 00000000..053cd8ae --- /dev/null +++ b/pandas/src/numpy_helper.h @@ -0,0 +1,163 @@ +#include "Python.h" +#include "numpy/arrayobject.h" +#include "numpy/arrayscalars.h" + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +#define PANDAS_FLOAT 0 +#define PANDAS_INT 1 +#define PANDAS_BOOL 2 +#define PANDAS_STRING 3 +#define PANDAS_OBJECT 4 +#define PANDAS_DATETIME 5 + +PANDAS_INLINE int +infer_type(PyObject* obj) { + if (PyBool_Check(obj)) { + return PANDAS_BOOL; + } + else if (PyArray_IsIntegerScalar(obj)) { + return PANDAS_INT; + } + else if (PyArray_IsScalar(obj, Datetime)) { + return PANDAS_DATETIME; + } + else if (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)) { + return PANDAS_FLOAT; + } + else if (PyString_Check(obj) || PyUnicode_Check(obj)) { + return PANDAS_STRING; + } + else { + return PANDAS_OBJECT; + } +} + +PANDAS_INLINE npy_int64 +get_nat() { + return NPY_MIN_INT64; +} + +PANDAS_INLINE npy_datetime +get_datetime64_value(PyObject* obj) { + return ((PyDatetimeScalarObject*) obj)->obval; + +} + + +PANDAS_INLINE int +is_integer_object(PyObject* obj) { + return (!PyBool_Check(obj)) && PyArray_IsIntegerScalar(obj); +// return PyArray_IsIntegerScalar(obj); +} + +PANDAS_INLINE int +is_float_object(PyObject* obj) { + return (PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating)); +} +PANDAS_INLINE int +is_complex_object(PyObject* obj) { + return (PyComplex_Check(obj) || PyArray_IsScalar(obj, ComplexFloating)); +} + +PANDAS_INLINE int +is_bool_object(PyObject* obj) { + return (PyBool_Check(obj) || PyArray_IsScalar(obj, Bool)); +} + +PANDAS_INLINE int +is_string_object(PyObject* obj) { + return (PyString_Check(obj) || PyUnicode_Check(obj)); +} + +PANDAS_INLINE int +is_datetime64_object(PyObject *obj) { + return PyArray_IsScalar(obj, Datetime); +} + +PANDAS_INLINE int +assign_value_1d(PyArrayObject* ap, Py_ssize_t _i, PyObject* v) { + npy_intp i = (npy_intp) _i; + char *item = (char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 
0); + return PyArray_DESCR(ap)->f->setitem(v, item, ap); +} + +PANDAS_INLINE PyObject* +get_value_1d(PyArrayObject* ap, Py_ssize_t i) { + char *item = (char *) PyArray_DATA(ap) + i * PyArray_STRIDE(ap, 0); + return PyArray_Scalar(item, PyArray_DESCR(ap), (PyObject*) ap); +} + + +PANDAS_INLINE char* +get_c_string(PyObject* obj) { +#if PY_VERSION_HEX >= 0x03000000 + PyObject* enc_str = PyUnicode_AsEncodedString(obj, "utf-8", "error"); + + char *ret; + ret = PyBytes_AS_STRING(enc_str); + + // TODO: memory leak here + + // Py_XDECREF(enc_str); + return ret; +#else + return PyString_AsString(obj); +#endif +} + +PANDAS_INLINE PyObject* +char_to_string(char* data) { +#if PY_VERSION_HEX >= 0x03000000 + return PyUnicode_FromString(data); +#else + return PyString_FromString(data); +#endif +} + +// PANDAS_INLINE int +// is_string(PyObject* obj) { +// #if PY_VERSION_HEX >= 0x03000000 +// return PyUnicode_Check(obj); +// #else +// return PyString_Check(obj); +// #endif + +PANDAS_INLINE PyObject* floatify(PyObject* str) { + +#if PY_VERSION_HEX >= 0x03000000 + return PyFloat_FromString(str); +#else + return PyFloat_FromString(str, NULL); +#endif + +} + + +// PANDAS_INLINE PyObject* +// get_base_ndarray(PyObject* ap) { +// // if (!ap || (NULL == ap)) { +// // Py_RETURN_NONE; +// // } + +// while (!PyArray_CheckExact(ap)) { +// ap = PyArray_BASE((PyArrayObject*) ap); +// if (ap == Py_None) Py_RETURN_NONE; +// } +// // PyArray_BASE is a borrowed reference +// if(ap) { +// Py_INCREF(ap); +// } +// return ap; +// } + diff --git a/pandas/src/offsets.pyx b/pandas/src/offsets.pyx new file mode 100644 index 00000000..68749501 --- /dev/null +++ b/pandas/src/offsets.pyx @@ -0,0 +1,363 @@ + +ctypedef enum time_res: + r_min = 0 + r_microsecond + r_second + r_minute + r_hour + r_day + r_month + r_year + r_max = 98 + r_invalid = 99 + + +cdef conversion_factor(time_res res1, time_res res2): + cdef: + time_res min_res, max_res + int64_t factor + + min_res = min(res1, res2) + max_res = max(res1, res2) + factor = 1 + + if min_res == max_res: + return factor + + while min_res < max_res: + if min_res < r_microsecond: + raise "Cannot convert from less than us" + elif min_res == r_microsecond: + factor *= 1000000 + min_res = r_second + elif min_res == r_second: + factor *= 60 + min_res = r_minute + elif min_res == r_minute: + factor *= 60 + min_res = r_hour + elif min_res == r_hour: + factor *= 24 + min_res = r_day + else: + raise "Cannot convert to month or year" + + return factor + +# Logic to generate ranges +# ----------------------------------------------------------------------------- + +cdef inline int64_t weekend_adjustment(int64_t dow, int bkwd): + if dow > 4: # sat or sun? + if bkwd: # roll back 1 or 2 days + return (4 - dow) + else: # roll forward 2 or 1 days + return (7 - dow) + return 0 + +cdef int64_t us_in_day = conversion_factor(r_microsecond, r_day) + +cdef class _Offset: + """ + Base class to generate timestamps. Set the anchor, and then move offsets + with next & prev. Retrieve timestamp with ts attribute. + """ + cdef: + int64_t t, dow, biz, dayoffset + object start + _TSObject ts + + def __cinit__(self): + self.t=0 + self.dow=0 + self.biz=0 + self.dayoffset=0 + + cpdef anchor(self, object start=None): + if start is not None: + self.start = start + self.ts = convert_to_tsobject(self.start) + self._setup() + + cdef _setup(self): + pass + + cpdef next(self): + pass + + cpdef prev(self): + pass + + cdef int64_t _ts(self): + """ + Access the current timestamp value, with a possible weekday + adjustment. 
+ """ + cdef int64_t adj + + if self.biz != 0: + adj = weekend_adjustment(self.dow, self.biz < 0) + return self.t + us_in_day * adj + else: + return self.t + + cdef int64_t _get_anchor(self): + """ + Retrieve an anchor relating to current offset we're on. + """ + return self.t - self.dayoffset * us_in_day + + property ts: + def __get__(self): + return self._ts() + +cdef class YearOffset(_Offset): + """ + Generate annual timestamps from provided start time; apply dayoffset to + each timestamp. If biz > 0, we choose the next business day at each time; + previous if < 0. + + Parameters + ---------- + dayoffset : int + biz : int + """ + cdef: + int64_t y, ly + + def __init__(self, int64_t dayoffset=0, int64_t biz=0, object anchor=None): + self.dayoffset = dayoffset + self.biz = biz + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + self.t = ts.value + self.dayoffset * us_in_day + self.y = ts.dts.year + + self.ly = (ts.dts.month > 2 or + ts.dts.month == 2 and ts.dts.day == 29) + + if self.biz != 0: + self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 + + cpdef next(self): + cdef int64_t days + + days = 365 + is_leapyear(self.y + self.ly) + + self.t += days * us_in_day + self.y += 1 + + if self.biz != 0: + self.dow = (self.dow + days) % 7 + + cpdef prev(self): + cdef int64_t days + + days = 365 + is_leapyear(self.y - (1-self.ly)) + + self.t -= days * us_in_day + self.y -= 1 + + if self.biz != 0: + self.dow = (self.dow - days) % 7 + +cdef class MonthOffset(_Offset): + """ + Generate monthly timestamps from provided start time, and apply dayoffset + to each timestamp. Stride to construct strided timestamps (eg quarterly). + If biz > 0, we choose the next business day at each time; previous if < 0. + + Parameters + ---------- + dayoffset : int + stride : int, > 0 + biz : int + """ + cdef: + Py_ssize_t stride, ly, m + int64_t y + + def __init__(self, int64_t dayoffset=0, Py_ssize_t stride=1, + int64_t biz=0, object anchor=None): + self.dayoffset = dayoffset + self.stride = stride + self.biz = biz + + if stride <= 0: + raise ValueError("Stride must be positive") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + self.t = ts.value + (self.dayoffset * us_in_day) + + # for day counting + self.m = ts.dts.month - 1 + self.y = ts.dts.year + self.ly = is_leapyear(self.y) + + if self.biz != 0: + self.dow = (ts_dayofweek(ts) + self.dayoffset) % 7 + + cpdef next(self): + cdef: + int64_t tmp, days + Py_ssize_t j + + days = 0 + for j in range(0, self.stride): + if self.m >= 12: + self.m -= 12 + self.y += 1 + self.ly = is_leapyear(self.y) + days += _days_per_month_table[self.ly][self.m] + self.m += 1 + + self.t += days * us_in_day + + if self.biz != 0: + self.dow = (self.dow + days) % 7 + + cpdef prev(self): + cdef: + int64_t tmp, days + Py_ssize_t j + + days = 0 + for j in range(0, self.stride): + self.m -= 1 + if self.m < 0: + self.m += 12 + self.y -= 1 + self.ly = is_leapyear(self.y) + days += _days_per_month_table[self.ly][self.m] + + self.t -= days * us_in_day + + if self.biz != 0: + self.dow = (self.dow - days) % 7 + +cdef class DayOfMonthOffset(_Offset): + """ + Generate relative monthly timestamps from month & year of provided start + time. For example, fridays of the third week of each month (week=3, day=4); + or, thursdays of the last week of each month (week=-1, day=3). 
+ + Parameters + ---------- + week : int + day : int, 0 to 6 + """ + cdef: + Py_ssize_t ly, m + int64_t y, day, week + + def __init__(self, int64_t week=0, int64_t day=0, object anchor=None): + self.week = week + self.day = day + + if self.day < 0 or self.day > 6: + raise ValueError("Day offset must be 0 to 6") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + + # rewind to beginning of month + self.t = ts.value - (ts.dts.day - 1) * us_in_day + self.dow = dayofweek(ts.dts.year, ts.dts.month, 1) + + # for day counting + self.m = ts.dts.month - 1 + self.y = ts.dts.year + self.ly = is_leapyear(self.y) + + cpdef next(self): + cdef: + int64_t tmp, days + + days = _days_per_month_table[self.ly][self.m] + self.t += days * us_in_day + self.dow = (self.dow + days) % 7 + + self.m += 1 + if self.m >= 12: + self.m -= 12 + self.y += 1 + self.ly = is_leapyear(self.y) + + cpdef prev(self): + cdef: + int64_t tmp, days + + days = _days_per_month_table[self.ly][(self.m - 1) % 12] + self.t -= days * us_in_day + self.dow = (self.dow - days) % 7 + + self.m -= 1 + if self.m < 0: + self.m += 12 + self.y -= 1 + self.ly = is_leapyear(self.y) + + cdef int64_t _ts(self): + """ + Overwrite default adjustment + """ + cdef int64_t adj = (self.week * 7) + (self.day - self.dow) % 7 + return self.t + us_in_day * adj + +cdef class DayOffset(_Offset): + """ + Generate daily timestamps beginning with first valid time >= start time. If + biz != 0, we skip weekends. Stride, to construct weekly timestamps. + + Parameters + ---------- + stride : int, > 0 + biz : boolean + """ + cdef: + Py_ssize_t stride + + def __init__(self, int64_t stride=1, int64_t biz=0, object anchor=None): + self.stride = stride + self.biz = biz + + if self.stride <= 0: + raise ValueError("Stride must be positive") + + if anchor is not None: + self.anchor(anchor) + + cdef _setup(self): + cdef _TSObject ts = self.ts + self.t = ts.value + if self.biz != 0: + self.dow = ts_dayofweek(ts) + + cpdef next(self): + self.t += (self.stride * us_in_day) + if self.biz != 0: + self.dow = (self.dow + self.stride) % 7 + if self.dow >= 5: + self.t += (7 - self.dow) * us_in_day + self.dow = 0 + + cpdef prev(self): + self.t -= (self.stride * us_in_day) + if self.biz != 0: + self.dow = (self.dow - self.stride) % 7 + if self.dow >= 5: + self.t += (4 - self.dow) * us_in_day + self.dow = 4 diff --git a/pandas/src/period.c b/pandas/src/period.c new file mode 100644 index 00000000..0f51f5c0 --- /dev/null +++ b/pandas/src/period.c @@ -0,0 +1,1371 @@ +#include "period.h" + + +/* + * Borrowed and derived code from scikits.timeseries that we will expose via + * Cython to pandas. This primarily concerns period representation and + * frequency conversion routines. 
+ */ + +/* see end of file for stuff pandas uses (search for 'pandas') */ + +/* ------------------------------------------------------------------ + * Code derived from scikits.timeseries + * ------------------------------------------------------------------*/ + + +static int mod_compat(int x, int m) { + int result = x % m; + if (result < 0) return result + m; + return result; +} + +static int floordiv(int x, int divisor) { + if (x < 0) { + if (mod_compat(x, divisor)) { + return x / divisor - 1; + } + else return x / divisor; + } else { + return x / divisor; + } +} + +static asfreq_info NULL_AF_INFO; + +/* Table with day offsets for each month (0-based, without and with leap) */ +static int month_offset[2][13] = { + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 }, + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 } +}; + +/* Table of number of days in a month (0-based, without and with leap) */ +static int days_in_month[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } +}; + +/* Return 1/0 iff year points to a leap year in calendar. */ +static int dInfoCalc_Leapyear(npy_int64 year, int calendar) +{ + if (calendar == GREGORIAN_CALENDAR) { + return (year % 4 == 0) && ((year % 100 != 0) || (year % 400 == 0)); + } else { + return (year % 4 == 0); + } +} + +/* Return the day of the week for the given absolute date. */ +static int dInfoCalc_DayOfWeek(npy_int64 absdate) +{ + int day_of_week; + + if (absdate >= 1) { + day_of_week = (absdate - 1) % 7; + } else { + day_of_week = 6 - ((-absdate) % 7); + } + return day_of_week; +} + +static int monthToQuarter(int month) { return ((month-1)/3)+1; } + +/* Return the year offset, that is the absolute date of the day + 31.12.(year-1) in the given calendar. + + Note: + For the Julian calendar we shift the absdate (which is measured + using the Gregorian Epoch) value by two days because the Epoch + (0001-01-01) in the Julian calendar lies 2 days before the Epoch in + the Gregorian calendar. */ +static int dInfoCalc_YearOffset(npy_int64 year, int calendar) +{ + year--; + if (calendar == GREGORIAN_CALENDAR) { + if (year >= 0 || -1/4 == -1) + return year*365 + year/4 - year/100 + year/400; + else + return year*365 + (year-3)/4 - (year-99)/100 + (year-399)/400; + } + else if (calendar == JULIAN_CALENDAR) { + if (year >= 0 || -1/4 == -1) + return year*365 + year/4 - 2; + else + return year*365 + (year-3)/4 - 2; + } + Py_Error(PyExc_ValueError, "unknown calendar"); + onError: + return INT_ERR_CODE; +} + +/* Set the instance's value using the given date and time. calendar may be set + * to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to indicate the calendar + * to be used. */ + +static int dInfoCalc_SetFromDateAndTime(struct date_info *dinfo, + int year, int month, int day, int hour, int minute, double second, + int calendar) +{ + + /* Calculate the absolute date */ + { + int leap; + npy_int64 absdate; + int yearoffset; + + /* Range check */ + Py_AssertWithArg(year > -(INT_MAX / 366) && year < (INT_MAX / 366), + PyExc_ValueError, + "year out of range: %i", + year); + + /* Is it a leap year ? 
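mod_compat and floordiv above provide floored (rather than C's truncated) modulo and division, which the conversion routines in this file rely on when period ordinals are negative, i.e. dates before the 1970 base year. A small self-contained check of that behaviour (illustrative sketch only):

#include <stdio.h>

/* Floored modulo: the result always lies in [0, m) for positive m. */
static int mod_compat(int x, int m) {
    int r = x % m;                 /* C's % truncates toward zero */
    return r < 0 ? r + m : r;
}

/* Floored division, consistent with mod_compat: x == divisor*floordiv + mod_compat. */
static int floordiv(int x, int divisor) {
    if (x < 0 && mod_compat(x, divisor) != 0)
        return x / divisor - 1;
    return x / divisor;
}

int main(void) {
    /* -13 months relative to the base year: two whole years back, 11 months in. */
    printf("%d %d\n", floordiv(-13, 12), mod_compat(-13, 12));   /* -2 11 */
    printf("%d %d\n", floordiv(25, 12),  mod_compat(25, 12));    /*  2  1 */
    return 0;
}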
*/ + leap = dInfoCalc_Leapyear(year, calendar); + + /* Negative month values indicate months relative to the years end */ + if (month < 0) month += 13; + Py_AssertWithArg(month >= 1 && month <= 12, + PyExc_ValueError, + "month out of range (1-12): %i", + month); + + /* Negative values indicate days relative to the months end */ + if (day < 0) day += days_in_month[leap][month - 1] + 1; + Py_AssertWithArg(day >= 1 && day <= days_in_month[leap][month - 1], + PyExc_ValueError, + "day out of range: %i", + day); + + yearoffset = dInfoCalc_YearOffset(year, calendar); + if (PyErr_Occurred()) goto onError; + + absdate = day + month_offset[leap][month - 1] + yearoffset; + + dinfo->absdate = absdate; + + dinfo->year = year; + dinfo->month = month; + dinfo->quarter = ((month-1)/3)+1; + dinfo->day = day; + + dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); + dinfo->day_of_year = (short)(absdate - yearoffset); + + dinfo->calendar = calendar; + } + + /* Calculate the absolute time */ + { + Py_AssertWithArg(hour >= 0 && hour <= 23, + PyExc_ValueError, + "hour out of range (0-23): %i", + hour); + Py_AssertWithArg(minute >= 0 && minute <= 59, + PyExc_ValueError, + "minute out of range (0-59): %i", + minute); + Py_AssertWithArg(second >= (double)0.0 && + (second < (double)60.0 || + (hour == 23 && minute == 59 && + second < (double)61.0)), + PyExc_ValueError, + "second out of range (0.0 - <60.0; <61.0 for 23:59): %f", + second); + + dinfo->abstime = (double)(hour*3600 + minute*60) + second; + + dinfo->hour = hour; + dinfo->minute = minute; + dinfo->second = second; + } + return 0; + + onError: + return INT_ERR_CODE; +} + +/* Sets the date part of the date_info struct using the indicated + calendar. + + XXX This could also be done using some integer arithmetics rather + than with this iterative approach... 
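dInfoCalc_SetFromAbsDate below recovers the calendar year from an absolute day number by first estimating year ≈ absdate / 365.2425 and then correcting with a small loop. A standalone sketch of that approximate-then-correct search for the Gregorian case, using 719163 (ORD_OFFSET, the absolute date of 1970-01-01 from period.h); is_leap and year_offset here stand in for dInfoCalc_Leapyear and dInfoCalc_YearOffset:

#include <stdio.h>

static int is_leap(long y) {
    return (y % 4 == 0) && ((y % 100 != 0) || (y % 400 == 0));
}

/* Absolute date of 31 December of (year - 1), proleptic Gregorian, year >= 1. */
static long year_offset(long year) {
    long y = year - 1;
    return y * 365 + y / 4 - y / 100 + y / 400;
}

int main(void) {
    long absdate = 719163L;                        /* 1970-01-01 (ORD_OFFSET) */
    long year = (long)(absdate / 365.2425) + 1;    /* rough estimate */

    while (year_offset(year) >= absdate)           /* backward correction */
        year--;
    while (absdate - year_offset(year) > 365 + is_leap(year))   /* forward correction */
        year++;

    printf("year=%ld day_of_year=%ld\n", year, absdate - year_offset(year));  /* 1970 1 */
    return 0;
}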
*/ +static +int dInfoCalc_SetFromAbsDate(register struct date_info *dinfo, + npy_int64 absdate, int calendar) +{ + register npy_int64 year; + npy_int64 yearoffset; + int leap,dayoffset; + int *monthoffset; + + /* Approximate year */ + if (calendar == GREGORIAN_CALENDAR) { + year = (npy_int64)(((double)absdate) / 365.2425); + } else if (calendar == JULIAN_CALENDAR) { + year = (npy_int64)(((double)absdate) / 365.25); + } else { + Py_Error(PyExc_ValueError, "unknown calendar"); + } + if (absdate > 0) year++; + + /* Apply corrections to reach the correct year */ + while (1) { + /* Calculate the year offset */ + yearoffset = dInfoCalc_YearOffset(year, calendar); + if (PyErr_Occurred()) + goto onError; + + /* Backward correction: absdate must be greater than the + yearoffset */ + if (yearoffset >= absdate) { + year--; + continue; + } + + dayoffset = absdate - yearoffset; + leap = dInfoCalc_Leapyear(year,calendar); + + /* Forward correction: non leap years only have 365 days */ + if (dayoffset > 365 && !leap) { + year++; + continue; + } + break; + } + + dinfo->year = year; + dinfo->calendar = calendar; + + /* Now iterate to find the month */ + monthoffset = month_offset[leap]; + { + register int month; + + for (month = 1; month < 13; month++) { + if (monthoffset[month] >= dayoffset) + break; + } + + dinfo->month = month; + dinfo->quarter = monthToQuarter(month); + dinfo->day = dayoffset - month_offset[leap][month-1]; + } + + + dinfo->day_of_week = dInfoCalc_DayOfWeek(absdate); + dinfo->day_of_year = dayoffset; + dinfo->absdate = absdate; + + return 0; + + onError: + return INT_ERR_CODE; +} + +/////////////////////////////////////////////// + +// frequency specifc conversion routines +// each function must take an integer fromDate and +// a char relation ('S' or 'E' for 'START' or 'END') +/////////////////////////////////////////////////////////////////////// + +// helpers for frequency conversion routines // + +static npy_int64 DtoB_weekday(npy_int64 absdate) { + return (((absdate) / 7) * 5) + (absdate) % 7 - BDAY_OFFSET; +} + +static npy_int64 DtoB_WeekendToMonday(npy_int64 absdate, int day_of_week) { + if (day_of_week > 4) { + //change to Monday after weekend + absdate += (7 - day_of_week); + } + return DtoB_weekday(absdate); +} + +static npy_int64 DtoB_WeekendToFriday(npy_int64 absdate, int day_of_week) { + if (day_of_week > 4) { + //change to friday before weekend + absdate -= (day_of_week - 4); + } + return DtoB_weekday(absdate); +} + +static npy_int64 absdate_from_ymd(int y, int m, int d) { + struct date_info tempDate; + if (dInfoCalc_SetFromDateAndTime(&tempDate, y, m, d, 0, 0, 0, GREGORIAN_CALENDAR)) { + return INT_ERR_CODE; + } + return tempDate.absdate; +} + +//************ FROM DAILY *************** + +static npy_int64 asfreq_DtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (dinfo.month > af_info->to_a_year_end) { + return (npy_int64)(dinfo.year + 1 - BASE_YEAR); + } + else { + return (npy_int64)(dinfo.year - BASE_YEAR); + } +} + +static npy_int64 DtoQ_yq(npy_int64 ordinal, asfreq_info *af_info, + int *year, int *quarter) { + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + if (af_info->to_q_year_end != 12) { + dinfo.month -= af_info->to_q_year_end; + if (dinfo.month <= 0) { dinfo.month += 12; } + else { dinfo.year += 1; } + dinfo.quarter = 
monthToQuarter(dinfo.month); + } + + *year = dinfo.year; + *quarter = dinfo.quarter; + + return 0; +} + + +static npy_int64 asfreq_DtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + int year, quarter; + + if (DtoQ_yq(ordinal, af_info, &year, &quarter) == INT_ERR_CODE) { + return INT_ERR_CODE; + } + + return (npy_int64)((year - BASE_YEAR) * 4 + quarter - 1); +} + +static npy_int64 asfreq_DtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + return (npy_int64)((dinfo.year - BASE_YEAR) * 12 + dinfo.month - 1); +} + +static npy_int64 asfreq_DtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return (ordinal + ORD_OFFSET - (1 + af_info->to_week_end))/7 + 1 - WEEK_OFFSET; +} + +static npy_int64 asfreq_DtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, ordinal + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } else { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } +} + +// needed for getDateInfo function +static npy_int64 asfreq_DtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } + +static npy_int64 asfreq_DtoHIGHFREQ(npy_int64 ordinal, char relation, npy_int64 per_day) { + if (relation == 'S') { + return ordinal * per_day; + } + else { + return (ordinal+ 1) * per_day - 1; + } +} + +static npy_int64 asfreq_DtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24); } +static npy_int64 asfreq_DtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24*60); } +static npy_int64 asfreq_DtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoHIGHFREQ(ordinal, relation, 24*60*60); } + +//************ FROM SECONDLY *************** + +static npy_int64 asfreq_StoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return (ordinal)/(60*60*24); } + +static npy_int64 asfreq_StoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_StoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_StoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_StoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + + +static npy_int64 asfreq_StoT(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / 60; +} + +static npy_int64 asfreq_StoH(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / (60*60); +} + +//************ FROM MINUTELY *************** + +static npy_int64 asfreq_TtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return (ordinal)/(60*24); } + +static npy_int64 
asfreq_TtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_TtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_TtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_TtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_TtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return ordinal / 60; +} + +static npy_int64 asfreq_TtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) { + if (relation == 'S') { + return ordinal*60; } + else { + return ordinal*60 + 59; + } +} + +//************ FROM HOURLY *************** + +static npy_int64 asfreq_HtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return ordinal / 24; } +static npy_int64 asfreq_HtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_HtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } +static npy_int64 asfreq_HtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoB(asfreq_HtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +// calculation works out the same as TtoS, so we just call that function for HtoT +static npy_int64 asfreq_HtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_TtoS(ordinal, relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_HtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) { + if (relation == 'S') { + return ordinal*60*60; + } + else { + return (ordinal + 1)*60*60 - 1; + } +} + +//************ FROM BUSINESS *************** + +static npy_int64 asfreq_BtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) + { + ordinal += BDAY_OFFSET; + return (((ordinal - 1) / 5) * 7 + + mod_compat(ordinal - 1, 5) + 1 - ORD_OFFSET); + } + +static npy_int64 asfreq_BtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_BtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_BtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_BtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) 
+ { return asfreq_DtoW(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_BtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_BtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_BtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_BtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +//************ FROM WEEKLY *************** + +static npy_int64 asfreq_WtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + ordinal += WEEK_OFFSET; + if (relation == 'S') { + return ordinal * 7 - 6 + af_info->from_week_end - ORD_OFFSET; + } + else { + return ordinal * 7 + af_info->from_week_end - ORD_OFFSET; + } +} + +static npy_int64 asfreq_WtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_WtoD(ordinal, 'E', af_info), relation, af_info); } +static npy_int64 asfreq_WtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoQ(asfreq_WtoD(ordinal, 'E', af_info), relation, af_info); } +static npy_int64 asfreq_WtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoM(asfreq_WtoD(ordinal, 'E', af_info), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_WtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_WtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_WtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_WtoD(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { + return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); + } + else { + return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); + } +} + +static npy_int64 asfreq_WtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_WtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_WtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } + +//************ FROM MONTHLY *************** + +static void MtoD_ym(npy_int64 ordinal, int *y, int *m) { + *y = ordinal / 12 + BASE_YEAR; + *m = mod_compat(ordinal, 12) + 1; +} + + +static npy_int64 asfreq_MtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + npy_int64 absdate; + int y, m; + + if (relation == 'S') { + MtoD_ym(ordinal, &y, &m); + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + return absdate - ORD_OFFSET; + } else { + MtoD_ym(ordinal + 1, &y, &m); + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + return absdate - 1 - ORD_OFFSET; + } +} + +static npy_int64 asfreq_MtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_MtoD(ordinal, 'E', &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_MtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return 
asfreq_DtoQ(asfreq_MtoD(ordinal, 'E', &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_MtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, af_info); } + +static npy_int64 asfreq_MtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_MtoD(ordinal, relation, &NULL_AF_INFO) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + +static npy_int64 asfreq_MtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_MtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_MtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_MtoD(ordinal, relation, &NULL_AF_INFO), relation, &NULL_AF_INFO); } + +//************ FROM QUARTERLY *************** + +static void QtoD_ym(npy_int64 ordinal, int *y, int *m, asfreq_info *af_info) { + *y = floordiv(ordinal, 4) + BASE_YEAR; + *m = mod_compat(ordinal, 4) * 3 + 1; + + if (af_info->from_q_year_end != 12) { + *m += af_info->from_q_year_end; + if (*m > 12) { *m -= 12; } + else { *y -= 1; } + } +} + +static npy_int64 asfreq_QtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + npy_int64 absdate; + int y, m; + + if (relation == 'S') { + QtoD_ym(ordinal, &y, &m, af_info); + // printf("ordinal: %d, year: %d, month: %d\n", (int) ordinal, y, m); + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + return absdate - ORD_OFFSET; + } else { + QtoD_ym(ordinal+1, &y, &m, af_info); + /* printf("ordinal: %d, year: %d, month: %d\n", (int) ordinal, y, m); */ + if ((absdate = absdate_from_ymd(y, m, 1)) == INT_ERR_CODE) return INT_ERR_CODE; + return absdate - 1 - ORD_OFFSET; + } +} + +static npy_int64 asfreq_QtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_QtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoA(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_QtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) { + return asfreq_DtoM(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } + +static npy_int64 asfreq_QtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_QtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_QtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_QtoD(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + + +static npy_int64 asfreq_QtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 
asfreq_QtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_QtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_QtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } + + +//************ FROM ANNUAL *************** + +static npy_int64 asfreq_AtoD(npy_int64 ordinal, char relation, asfreq_info *af_info) { + npy_int64 absdate, final_adj; + int year; + int month = (af_info->from_a_year_end) % 12; + + // start from 1970 + ordinal += BASE_YEAR; + + if (month == 0) { month = 1; } + else { month += 1; } + + if (relation == 'S') { + if (af_info->from_a_year_end == 12) {year = ordinal;} + else {year = ordinal - 1;} + final_adj = 0; + } else { + if (af_info->from_a_year_end == 12) {year = ordinal+1;} + else {year = ordinal;} + final_adj = -1; + } + absdate = absdate_from_ymd(year, month, 1); + if (absdate == INT_ERR_CODE) { + return INT_ERR_CODE; + } + return absdate + final_adj - ORD_OFFSET; +} + +static npy_int64 asfreq_AtoA(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoA(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_AtoQ(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoQ(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_AtoM(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoM(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_AtoW(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoW(asfreq_AtoD(ordinal, relation, af_info), relation, af_info); } + +static npy_int64 asfreq_AtoB(npy_int64 ordinal, char relation, asfreq_info *af_info) { + + struct date_info dinfo; + if (dInfoCalc_SetFromAbsDate(&dinfo, + asfreq_AtoD(ordinal, relation, af_info) + ORD_OFFSET, + GREGORIAN_CALENDAR)) return INT_ERR_CODE; + + if (relation == 'S') { return DtoB_WeekendToMonday(dinfo.absdate, dinfo.day_of_week); } + else { return DtoB_WeekendToFriday(dinfo.absdate, dinfo.day_of_week); } +} + +static npy_int64 asfreq_AtoH(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoH(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_AtoT(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoT(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } +static npy_int64 asfreq_AtoS(npy_int64 ordinal, char relation, asfreq_info *af_info) + { return asfreq_DtoS(asfreq_AtoD(ordinal, relation, af_info), relation, &NULL_AF_INFO); } + +static npy_int64 nofunc(npy_int64 ordinal, char relation, asfreq_info *af_info) { return INT_ERR_CODE; } +static npy_int64 no_op(npy_int64 ordinal, char relation, asfreq_info *af_info) { return ordinal; } + +// end of frequency specific conversion routines + +static int get_freq_group(int freq) { return (freq/1000)*1000; } + +static int calc_a_year_end(int freq, int group) { + int result = (freq - group) % 12; + if (result == 0) {return 12;} + else {return result;} +} + +static int calc_week_end(int freq, int group) { + return freq - group; +} + +void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) { + int fromGroup = get_freq_group(fromFreq); + int toGroup = get_freq_group(toFreq); + + switch(fromGroup) + { + case FR_WK: { + af_info->from_week_end = calc_week_end(fromFreq, fromGroup); + } break; 
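The integer frequency codes used throughout (the FR_* constants declared in period.h further down) pack a frequency group in the thousands plus an anchor detail such as the fiscal year-end month: 1000 is annual with December year end, 1001 annual with January year end, 2000 quarterly with December year end. get_freq_group and calc_a_year_end just above decode them; a standalone sketch:

#include <stdio.h>

/* Frequency codes: group in the thousands, anchor detail in the remainder. */
static int get_freq_group(int freq) { return (freq / 1000) * 1000; }

/* Year-end month encoded in an annual or quarterly code (remainder 0 means December). */
static int calc_a_year_end(int freq, int group) {
    int r = (freq - group) % 12;
    return r == 0 ? 12 : r;
}

int main(void) {
    int ann_jan = 1001;   /* FR_ANNJAN */
    int qtr_dec = 2000;   /* FR_QTR, default December year end */
    printf("%d -> group %d, year end month %d\n", ann_jan,
           get_freq_group(ann_jan), calc_a_year_end(ann_jan, get_freq_group(ann_jan)));  /* 1000, 1 */
    printf("%d -> group %d, year end month %d\n", qtr_dec,
           get_freq_group(qtr_dec), calc_a_year_end(qtr_dec, get_freq_group(qtr_dec)));  /* 2000, 12 */
    return 0;
}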
+ case FR_ANN: { + af_info->from_a_year_end = calc_a_year_end(fromFreq, fromGroup); + } break; + case FR_QTR: { + af_info->from_q_year_end = calc_a_year_end(fromFreq, fromGroup); + } break; + } + + switch(toGroup) + { + case FR_WK: { + af_info->to_week_end = calc_week_end(toFreq, toGroup); + } break; + case FR_ANN: { + af_info->to_a_year_end = calc_a_year_end(toFreq, toGroup); + } break; + case FR_QTR: { + af_info->to_q_year_end = calc_a_year_end(toFreq, toGroup); + } break; + } +} + + +freq_conv_func get_asfreq_func(int fromFreq, int toFreq) +{ + int fromGroup = get_freq_group(fromFreq); + int toGroup = get_freq_group(toFreq); + + if (fromGroup == FR_UND) { fromGroup = FR_DAY; } + + switch(fromGroup) + { + case FR_ANN: + switch(toGroup) + { + case FR_ANN: return &asfreq_AtoA; + case FR_QTR: return &asfreq_AtoQ; + case FR_MTH: return &asfreq_AtoM; + case FR_WK: return &asfreq_AtoW; + case FR_BUS: return &asfreq_AtoB; + case FR_DAY: return &asfreq_AtoD; + case FR_HR: return &asfreq_AtoH; + case FR_MIN: return &asfreq_AtoT; + case FR_SEC: return &asfreq_AtoS; + default: return &nofunc; + } + + case FR_QTR: + switch(toGroup) + { + case FR_ANN: return &asfreq_QtoA; + case FR_QTR: return &asfreq_QtoQ; + case FR_MTH: return &asfreq_QtoM; + case FR_WK: return &asfreq_QtoW; + case FR_BUS: return &asfreq_QtoB; + case FR_DAY: return &asfreq_QtoD; + case FR_HR: return &asfreq_QtoH; + case FR_MIN: return &asfreq_QtoT; + case FR_SEC: return &asfreq_QtoS; + default: return &nofunc; + } + + case FR_MTH: + switch(toGroup) + { + case FR_ANN: return &asfreq_MtoA; + case FR_QTR: return &asfreq_MtoQ; + case FR_MTH: return &no_op; + case FR_WK: return &asfreq_MtoW; + case FR_BUS: return &asfreq_MtoB; + case FR_DAY: return &asfreq_MtoD; + case FR_HR: return &asfreq_MtoH; + case FR_MIN: return &asfreq_MtoT; + case FR_SEC: return &asfreq_MtoS; + default: return &nofunc; + } + + case FR_WK: + switch(toGroup) + { + case FR_ANN: return &asfreq_WtoA; + case FR_QTR: return &asfreq_WtoQ; + case FR_MTH: return &asfreq_WtoM; + case FR_WK: return &asfreq_WtoW; + case FR_BUS: return &asfreq_WtoB; + case FR_DAY: return &asfreq_WtoD; + case FR_HR: return &asfreq_WtoH; + case FR_MIN: return &asfreq_WtoT; + case FR_SEC: return &asfreq_WtoS; + default: return &nofunc; + } + + case FR_BUS: + switch(toGroup) + { + case FR_ANN: return &asfreq_BtoA; + case FR_QTR: return &asfreq_BtoQ; + case FR_MTH: return &asfreq_BtoM; + case FR_WK: return &asfreq_BtoW; + case FR_DAY: return &asfreq_BtoD; + case FR_BUS: return &no_op; + case FR_HR: return &asfreq_BtoH; + case FR_MIN: return &asfreq_BtoT; + case FR_SEC: return &asfreq_BtoS; + default: return &nofunc; + } + + case FR_DAY: + switch(toGroup) + { + case FR_ANN: return &asfreq_DtoA; + case FR_QTR: return &asfreq_DtoQ; + case FR_MTH: return &asfreq_DtoM; + case FR_WK: return &asfreq_DtoW; + case FR_BUS: return &asfreq_DtoB; + case FR_DAY: return &asfreq_DtoD; + case FR_HR: return &asfreq_DtoH; + case FR_MIN: return &asfreq_DtoT; + case FR_SEC: return &asfreq_DtoS; + default: return &nofunc; + } + + case FR_HR: + switch(toGroup) + { + case FR_ANN: return &asfreq_HtoA; + case FR_QTR: return &asfreq_HtoQ; + case FR_MTH: return &asfreq_HtoM; + case FR_WK: return &asfreq_HtoW; + case FR_BUS: return &asfreq_HtoB; + case FR_DAY: return &asfreq_HtoD; + case FR_HR: return &no_op; + case FR_MIN: return &asfreq_HtoT; + case FR_SEC: return &asfreq_HtoS; + default: return &nofunc; + } + + case FR_MIN: + switch(toGroup) + { + case FR_ANN: return &asfreq_TtoA; + case FR_QTR: return &asfreq_TtoQ; + case 
FR_MTH: return &asfreq_TtoM; + case FR_WK: return &asfreq_TtoW; + case FR_BUS: return &asfreq_TtoB; + case FR_DAY: return &asfreq_TtoD; + case FR_HR: return &asfreq_TtoH; + case FR_MIN: return &no_op; + case FR_SEC: return &asfreq_TtoS; + default: return &nofunc; + } + + case FR_SEC: + switch(toGroup) + { + case FR_ANN: return &asfreq_StoA; + case FR_QTR: return &asfreq_StoQ; + case FR_MTH: return &asfreq_StoM; + case FR_WK: return &asfreq_StoW; + case FR_BUS: return &asfreq_StoB; + case FR_DAY: return &asfreq_StoD; + case FR_HR: return &asfreq_StoH; + case FR_MIN: return &asfreq_StoT; + case FR_SEC: return &no_op; + default: return &nofunc; + } + default: return &nofunc; + } +} + +double get_abs_time(int freq, npy_int64 daily_ord, npy_int64 ordinal) { + + npy_int64 start_ord, per_day, unit; + switch(freq) + { + case FR_HR: + per_day = 24; + unit = 60 * 60; + break; + case FR_MIN: + per_day = 24*60; + unit = 60; + break; + case FR_SEC: + per_day = 24*60*60; + unit = 1; + break; + default: + return 0; // 24*60*60 - 1; + } + + start_ord = asfreq_DtoHIGHFREQ(daily_ord, 'S', per_day); + /* printf("start_ord: %d\n", start_ord); */ + return (double) ( unit * (ordinal - start_ord)); + /* if (ordinal >= 0) { */ + /* } */ + /* else { */ + /* return (double) (unit * mod_compat(ordinal - start_ord, per_day)); */ + /* } */ +} + +/* Sets the time part of the DateTime object. */ +static +int dInfoCalc_SetFromAbsTime(struct date_info *dinfo, + double abstime) +{ + int inttime; + int hour,minute; + double second; + + inttime = (int)abstime; + hour = inttime / 3600; + minute = (inttime % 3600) / 60; + second = abstime - (double)(hour*3600 + minute*60); + + dinfo->hour = hour; + dinfo->minute = minute; + dinfo->second = second; + + dinfo->abstime = abstime; + + return 0; +} + +/* Set the instance's value using the given date and time. calendar + may be set to the flags: GREGORIAN_CALENDAR, JULIAN_CALENDAR to + indicate the calendar to be used. 
*/ +static +int dInfoCalc_SetFromAbsDateTime(struct date_info *dinfo, + npy_int64 absdate, + double abstime, + int calendar) +{ + + /* Bounds check */ + Py_AssertWithArg(abstime >= 0.0 && abstime <= SECONDS_PER_DAY, + PyExc_ValueError, + "abstime out of range (0.0 - 86400.0): %f", + abstime); + + /* Calculate the date */ + if (dInfoCalc_SetFromAbsDate(dinfo, absdate, calendar)) goto onError; + + /* Calculate the time */ + if (dInfoCalc_SetFromAbsTime(dinfo, abstime)) goto onError; + + return 0; + onError: + return INT_ERR_CODE; +} + +/* ------------------------------------------------------------------ + * New pandas API-helper code, to expose to cython + * ------------------------------------------------------------------*/ + +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation) +{ + npy_int64 val; + freq_conv_func func; + asfreq_info finfo; + + func = get_asfreq_func(freq1, freq2); + get_asfreq_info(freq1, freq2, &finfo); + + val = (*func)(period_ordinal, relation, &finfo); + + if (val == INT_ERR_CODE) { + // Py_Error(PyExc_ValueError, "Unable to convert to desired frequency."); + goto onError; + } + return val; +onError: + return INT_ERR_CODE; +} + + +/* generate an ordinal in period space */ +npy_int64 get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, + int freq) +{ + npy_int64 absdays, delta; + npy_int64 weeks, days; + npy_int64 adj_ordinal, ordinal, day_adj; + int freq_group, fmonth, mdiff, quarter; + freq_group = get_freq_group(freq); + + if (freq == FR_SEC) { + absdays = absdate_from_ymd(year, month, day); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*86400 + hour*3600 + minute*60 + second); + } + + if (freq == FR_MIN) { + absdays = absdate_from_ymd(year, month, day); + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*1440 + hour*60 + minute); + } + + if (freq == FR_HR) { + if ((absdays = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + delta = (absdays - ORD_OFFSET); + return (npy_int64)(delta*24 + hour); + } + + if (freq == FR_DAY) + { + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + } + + if (freq == FR_UND) + { + return (npy_int64) (absdate_from_ymd(year, month, day) - ORD_OFFSET); + } + + if (freq == FR_BUS) + { + if((days = absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + weeks = days / 7; + return (npy_int64)(days - weeks * 2) - BDAY_OFFSET; + } + + if (freq_group == FR_WK) + { + if((ordinal = (npy_int64)absdate_from_ymd(year, month, day)) == INT_ERR_CODE) + { + goto onError; + } + day_adj = freq - FR_WK; + return (ordinal - (1 + day_adj)) / 7 + 1 - WEEK_OFFSET; + } + + if (freq == FR_MTH) + { + return (year - BASE_YEAR) * 12 + month - 1; + } + + if (freq_group == FR_QTR) + { + fmonth = freq - FR_QTR; + if (fmonth == 0) fmonth = 12; + + mdiff = month - fmonth; + if (mdiff < 0) mdiff += 12; + if (month >= fmonth) mdiff += 12; + + return (year - BASE_YEAR) * 4 + (mdiff - 1) / 3; + } + + if (freq_group == FR_ANN) + { + fmonth = freq - FR_ANN; + if (fmonth == 0) fmonth = 12; + if (month <= fmonth) { + return year - BASE_YEAR; + } + else { + return year - BASE_YEAR + 1; + } + } + + Py_Error(PyExc_RuntimeError, "Unable to generate frequency ordinal"); + +onError: + return INT_ERR_CODE; +} + +/* + Returns the proleptic Gregorian ordinal of the date, as an integer. + This corresponds to the number of days since Jan., 1st, 1AD. 
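get_period_ordinal above reduces each supported frequency to a simple count from the 1970 epoch: months since January 1970 for FR_MTH, days since 1970-01-01 (the proleptic ordinal minus ORD_OFFSET) for FR_DAY, and so on. A sketch of those two branches with concrete values (illustrative only):

#include <stdio.h>

#define BASE_YEAR  1970
#define ORD_OFFSET 719163LL   /* proleptic Gregorian ordinal of 1970-01-01 */

int main(void) {
    /* FR_MTH: months elapsed since January 1970. */
    int year = 2012, month = 6;
    long long mth_ordinal = (long long)(year - BASE_YEAR) * 12 + month - 1;

    /* FR_DAY: days elapsed since 1970-01-01; 719893 is the ordinal of 1972-01-01. */
    long long day_ordinal = 719893LL - ORD_OFFSET;

    printf("June 2012  -> %lld\n", mth_ordinal);   /* 509 */
    printf("1972-01-01 -> %lld\n", day_ordinal);   /* 730: 1970 and 1971, neither leap */
    return 0;
}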
+ When the instance has a frequency less than daily, the proleptic date + is calculated for the last day of the period. +*/ + +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq) +{ + asfreq_info af_info; + npy_int64 (*toDaily)(npy_int64, char, asfreq_info*); + + if (freq == FR_DAY) + return period_ordinal + ORD_OFFSET; + + toDaily = get_asfreq_func(freq, FR_DAY); + get_asfreq_info(freq, FR_DAY, &af_info); + return toDaily(period_ordinal, 'E', &af_info) + ORD_OFFSET; +} + +char *str_replace(const char *s, const char *old, const char *new) { + char *ret; + int i, count = 0; + size_t newlen = strlen(new); + size_t oldlen = strlen(old); + + for (i = 0; s[i] != '\0'; i++) { + if (strstr(&s[i], old) == &s[i]) { + count++; + i += oldlen - 1; + } + } + + ret = PyArray_malloc(i + 1 + count * (newlen - oldlen)); + if (ret == NULL) {return (char *)PyErr_NoMemory();} + + i = 0; + while (*s) { + if (strstr(s, old) == s) { + strcpy(&ret[i], new); + i += newlen; + s += oldlen; + } else { + ret[i++] = *s++; + } + } + ret[i] = '\0'; + + return ret; +} + +// function to generate a nice string representation of the period +// object, originally from DateObject_strftime + +char* c_strftime(struct date_info *tmp, char *fmt) { + struct tm c_date; + char* result; + struct date_info dinfo = *tmp; + int result_len = strlen(fmt) + 50; + + c_date.tm_sec = (int)dinfo.second; + c_date.tm_min = dinfo.minute; + c_date.tm_hour = dinfo.hour; + c_date.tm_mday = dinfo.day; + c_date.tm_mon = dinfo.month - 1; + c_date.tm_year = dinfo.year - 1900; + c_date.tm_wday = (dinfo.day_of_week + 1) % 7; + c_date.tm_yday = dinfo.day_of_year - 1; + c_date.tm_isdst = -1; + + result = malloc(result_len * sizeof(char)); + + strftime(result, result_len, fmt, &c_date); + + return result; +} + +int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year) { + asfreq_info af_info; + int qtr_freq; + npy_int64 daily_ord; + npy_int64 (*toDaily)(npy_int64, char, asfreq_info*) = NULL; + + toDaily = get_asfreq_func(freq, FR_DAY); + get_asfreq_info(freq, FR_DAY, &af_info); + + daily_ord = toDaily(ordinal, 'E', &af_info); + + if (get_freq_group(freq) == FR_QTR) { + qtr_freq = freq; + } else { qtr_freq = FR_QTR; } + get_asfreq_info(FR_DAY, qtr_freq, &af_info); + + if(DtoQ_yq(daily_ord, &af_info, year, quarter) == INT_ERR_CODE) + return -1; + + return 0; +} + + + + + +static int _quarter_year(npy_int64 ordinal, int freq, int *year, int *quarter) { + asfreq_info af_info; + int qtr_freq; + + ordinal = get_python_ordinal(ordinal, freq) - ORD_OFFSET; + + if (get_freq_group(freq) == FR_QTR) + qtr_freq = freq; + else + qtr_freq = FR_QTR; + + get_asfreq_info(FR_DAY, qtr_freq, &af_info); + + if (DtoQ_yq(ordinal, &af_info, year, quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + + if ((qtr_freq % 1000) > 12) + *year -= 1; + + return 0; +} + +static int _ISOWeek(struct date_info *dinfo) +{ + int week; + + /* Estimate */ + week = (dinfo->day_of_year-1) - dinfo->day_of_week + 3; + if (week >= 0) week = week / 7 + 1; + + /* Verify */ + if (week < 0) { + /* The day lies in last week of the previous year */ + if ((week > -2) || + (week == -2 && dInfoCalc_Leapyear(dinfo->year-1, dinfo->calendar))) + week = 53; + else + week = 52; + } else if (week == 53) { + /* Check if the week belongs to year or year+1 */ + if (31-dinfo->day + dinfo->day_of_week < 3) { + week = 1; + } + } + + return week; +} + +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo) +{ + npy_int64 absdate = get_python_ordinal(ordinal, freq); + /* printf("freq: %d, 
absdate: %d\n", freq, (int) absdate); */ + double abstime = get_abs_time(freq, absdate - ORD_OFFSET, ordinal); + if (abstime < 0) { + abstime += 86400; + absdate -= 1; + } + + if(dInfoCalc_SetFromAbsDateTime(dinfo, absdate, + abstime, GREGORIAN_CALENDAR)) + return INT_ERR_CODE; + + return 0; +} + +int pyear(npy_int64 ordinal, int freq) { + struct date_info dinfo; + get_date_info(ordinal, freq, &dinfo); + return dinfo.year; +} + +int pqyear(npy_int64 ordinal, int freq) { + int year, quarter; + if( _quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + return year; +} + +int pquarter(npy_int64 ordinal, int freq) { + int year, quarter; + if(_quarter_year(ordinal, freq, &year, &quarter) == INT_ERR_CODE) + return INT_ERR_CODE; + return quarter; +} + +int pmonth(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.month; +} + +int pday(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day; +} + +int pweekday(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_week; +} + +int pday_of_week(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_week; +} + +int pday_of_year(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.day_of_year; +} + +int pweek(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return _ISOWeek(&dinfo); +} + +int phour(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.hour; +} + +int pminute(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return dinfo.minute; +} + +int psecond(npy_int64 ordinal, int freq) { + struct date_info dinfo; + if(get_date_info(ordinal, freq, &dinfo) == INT_ERR_CODE) + return INT_ERR_CODE; + return (int)dinfo.second; +} diff --git a/pandas/src/period.h b/pandas/src/period.h new file mode 100644 index 00000000..1c58eaed --- /dev/null +++ b/pandas/src/period.h @@ -0,0 +1,159 @@ +/* + * Borrowed and derived code from scikits.timeseries that we will expose via + * Cython to pandas. This primarily concerns interval representation and + * frequency conversion routines. 
+ */ + +#ifndef C_PERIOD_H +#define C_PERIOD_H + +#include +#include "numpy/ndarraytypes.h" +#include "stdint.h" +#include "limits.h" + +/* + * declarations from period here + */ + +#define GREGORIAN_CALENDAR 0 +#define JULIAN_CALENDAR 1 + +#define SECONDS_PER_DAY ((double) 86400.0) + +#define Py_AssertWithArg(x,errortype,errorstr,a1) {if (!(x)) {PyErr_Format(errortype,errorstr,a1);goto onError;}} +#define Py_Error(errortype,errorstr) {PyErr_SetString(errortype,errorstr);goto onError;} + +/*** FREQUENCY CONSTANTS ***/ + +// HIGHFREQ_ORIG is the datetime ordinal from which to begin the second +// frequency ordinal sequence + +// typedef int64_t npy_int64; +// begins second ordinal at 1/1/1970 unix epoch + +// #define HIGHFREQ_ORIG 62135683200LL +#define BASE_YEAR 1970 +#define ORD_OFFSET 719163LL // days until 1970-01-01 +#define BDAY_OFFSET 513689LL // days until 1970-01-01 +#define WEEK_OFFSET 102737LL +#define HIGHFREQ_ORIG 0 // ORD_OFFSET * 86400LL // days until 1970-01-01 + +#define FR_ANN 1000 /* Annual */ +#define FR_ANNDEC FR_ANN /* Annual - December year end*/ +#define FR_ANNJAN 1001 /* Annual - January year end*/ +#define FR_ANNFEB 1002 /* Annual - February year end*/ +#define FR_ANNMAR 1003 /* Annual - March year end*/ +#define FR_ANNAPR 1004 /* Annual - April year end*/ +#define FR_ANNMAY 1005 /* Annual - May year end*/ +#define FR_ANNJUN 1006 /* Annual - June year end*/ +#define FR_ANNJUL 1007 /* Annual - July year end*/ +#define FR_ANNAUG 1008 /* Annual - August year end*/ +#define FR_ANNSEP 1009 /* Annual - September year end*/ +#define FR_ANNOCT 1010 /* Annual - October year end*/ +#define FR_ANNNOV 1011 /* Annual - November year end*/ + +/* The standard quarterly frequencies with various fiscal year ends + eg, Q42005 for Q@OCT runs Aug 1, 2005 to Oct 31, 2005 */ +#define FR_QTR 2000 /* Quarterly - December year end (default quarterly) */ +#define FR_QTRDEC FR_QTR /* Quarterly - December year end */ +#define FR_QTRJAN 2001 /* Quarterly - January year end */ +#define FR_QTRFEB 2002 /* Quarterly - February year end */ +#define FR_QTRMAR 2003 /* Quarterly - March year end */ +#define FR_QTRAPR 2004 /* Quarterly - April year end */ +#define FR_QTRMAY 2005 /* Quarterly - May year end */ +#define FR_QTRJUN 2006 /* Quarterly - June year end */ +#define FR_QTRJUL 2007 /* Quarterly - July year end */ +#define FR_QTRAUG 2008 /* Quarterly - August year end */ +#define FR_QTRSEP 2009 /* Quarterly - September year end */ +#define FR_QTROCT 2010 /* Quarterly - October year end */ +#define FR_QTRNOV 2011 /* Quarterly - November year end */ + +#define FR_MTH 3000 /* Monthly */ + +#define FR_WK 4000 /* Weekly */ +#define FR_WKSUN FR_WK /* Weekly - Sunday end of week */ +#define FR_WKMON 4001 /* Weekly - Monday end of week */ +#define FR_WKTUE 4002 /* Weekly - Tuesday end of week */ +#define FR_WKWED 4003 /* Weekly - Wednesday end of week */ +#define FR_WKTHU 4004 /* Weekly - Thursday end of week */ +#define FR_WKFRI 4005 /* Weekly - Friday end of week */ +#define FR_WKSAT 4006 /* Weekly - Saturday end of week */ + +#define FR_BUS 5000 /* Business days */ +#define FR_DAY 6000 /* Daily */ +#define FR_HR 7000 /* Hourly */ +#define FR_MIN 8000 /* Minutely */ +#define FR_SEC 9000 /* Secondly */ + +#define FR_UND -10000 /* Undefined */ + +#define INT_ERR_CODE INT32_MIN + +#define MEM_CHECK(item) if (item == NULL) { return PyErr_NoMemory(); } +#define ERR_CHECK(item) if (item == NULL) { return NULL; } + +typedef struct asfreq_info { + int from_week_end; // day the week ends on in the "from" 
frequency + int to_week_end; // day the week ends on in the "to" frequency + + int from_a_year_end; // month the year ends on in the "from" frequency + int to_a_year_end; // month the year ends on in the "to" frequency + + int from_q_year_end; // month the year ends on in the "from" frequency + int to_q_year_end; // month the year ends on in the "to" frequency +} asfreq_info; + + +typedef struct date_info { + npy_int64 absdate; + double abstime; + + double second; + int minute; + int hour; + int day; + int month; + int quarter; + int year; + int day_of_week; + int day_of_year; + int calendar; +} date_info; + +typedef npy_int64 (*freq_conv_func)(npy_int64, char, asfreq_info*); + +/* + * new pandas API helper functions here + */ + +npy_int64 asfreq(npy_int64 period_ordinal, int freq1, int freq2, char relation); + +npy_int64 get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, + int freq); + +npy_int64 get_python_ordinal(npy_int64 period_ordinal, int freq); + +int get_date_info(npy_int64 ordinal, int freq, struct date_info *dinfo); +freq_conv_func get_asfreq_func(int fromFreq, int toFreq); +void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info); + +int pyear(npy_int64 ordinal, int freq); +int pqyear(npy_int64 ordinal, int freq); +int pquarter(npy_int64 ordinal, int freq); +int pmonth(npy_int64 ordinal, int freq); +int pday(npy_int64 ordinal, int freq); +int pweekday(npy_int64 ordinal, int freq); +int pday_of_week(npy_int64 ordinal, int freq); +int pday_of_year(npy_int64 ordinal, int freq); +int pweek(npy_int64 ordinal, int freq); +int phour(npy_int64 ordinal, int freq); +int pminute(npy_int64 ordinal, int freq); +int psecond(npy_int64 ordinal, int freq); + +double getAbsTime(int freq, npy_int64 dailyDate, npy_int64 originalDate); +char *c_strftime(struct date_info *dinfo, char *fmt); +int get_yq(npy_int64 ordinal, int freq, int *quarter, int *year); + +#endif diff --git a/pandas/src/plib.pyx b/pandas/src/plib.pyx new file mode 100644 index 00000000..8e2e0f2c --- /dev/null +++ b/pandas/src/plib.pyx @@ -0,0 +1,352 @@ +# cython: profile=False + +cimport numpy as np +import numpy as np + +from numpy cimport int32_t, int64_t, import_array, ndarray +from cpython cimport * + +from libc.stdlib cimport free + +# this is our datetime.pxd +from datetime cimport * +from util cimport is_integer_object, is_datetime64_object + +from datetime import timedelta +from dateutil.parser import parse as parse_date +cimport util + +import cython + +# initialize numpy +import_array() + +# import datetime C API +PyDateTime_IMPORT + + +cdef extern from "period.h": + ctypedef struct date_info: + int64_t absdate + double abstime + double second + int minute + int hour + int day + int month + int quarter + int year + int day_of_week + int day_of_year + int calendar + + ctypedef struct asfreq_info: + int from_week_end + int to_week_end + + int from_a_year_end + int to_a_year_end + + int from_q_year_end + int to_q_year_end + + ctypedef int64_t (*freq_conv_func)(int64_t, char, asfreq_info*) + + int64_t asfreq(int64_t dtordinal, int freq1, int freq2, char relation) except INT32_MIN + freq_conv_func get_asfreq_func(int fromFreq, int toFreq) + void get_asfreq_info(int fromFreq, int toFreq, asfreq_info *af_info) + + int64_t get_period_ordinal(int year, int month, int day, + int hour, int minute, int second, + int freq) except INT32_MIN + + int64_t get_python_ordinal(int64_t period_ordinal, int freq) except INT32_MIN + + int get_date_info(int64_t ordinal, int freq, date_info *dinfo) 
except INT32_MIN + double getAbsTime(int, int64_t, int64_t) + + int pyear(int64_t ordinal, int freq) except INT32_MIN + int pqyear(int64_t ordinal, int freq) except INT32_MIN + int pquarter(int64_t ordinal, int freq) except INT32_MIN + int pmonth(int64_t ordinal, int freq) except INT32_MIN + int pday(int64_t ordinal, int freq) except INT32_MIN + int pweekday(int64_t ordinal, int freq) except INT32_MIN + int pday_of_week(int64_t ordinal, int freq) except INT32_MIN + int pday_of_year(int64_t ordinal, int freq) except INT32_MIN + int pweek(int64_t ordinal, int freq) except INT32_MIN + int phour(int64_t ordinal, int freq) except INT32_MIN + int pminute(int64_t ordinal, int freq) except INT32_MIN + int psecond(int64_t ordinal, int freq) except INT32_MIN + char *c_strftime(date_info *dinfo, char *fmt) + int get_yq(int64_t ordinal, int freq, int *quarter, int *year) + +# Period logic +#---------------------------------------------------------------------- + +cdef inline int64_t apply_mult(int64_t period_ord, int64_t mult): + """ + Get freq+multiple ordinal value from corresponding freq-only ordinal value. + For example, 5min ordinal will be 1/5th the 1min ordinal (rounding down to + integer). + """ + if mult == 1: + return period_ord + + return (period_ord - 1) // mult + +cdef inline int64_t remove_mult(int64_t period_ord_w_mult, int64_t mult): + """ + Get freq-only ordinal value from corresponding freq+multiple ordinal. + """ + if mult == 1: + return period_ord_w_mult + + return period_ord_w_mult * mult + 1; + +def dt64arr_to_periodarr(ndarray[int64_t] dtarr, int freq): + """ + Convert array of datetime64 values (passed in as 'i8' dtype) to a set of + periods corresponding to desired frequency, per period convention. + """ + cdef: + ndarray[int64_t] out + Py_ssize_t i, l + pandas_datetimestruct dts + + l = len(dtarr) + + out = np.empty(l, dtype='i8') + + for i in range(l): + pandas_datetime_to_datetimestruct(dtarr[i], PANDAS_FR_ns, &dts) + out[i] = get_period_ordinal(dts.year, dts.month, dts.day, + dts.hour, dts.min, dts.sec, freq) + return out + +def periodarr_to_dt64arr(ndarray[int64_t] periodarr, int freq): + """ + Convert array to datetime64 values from a set of ordinals corresponding to + periods per period convention. + """ + cdef: + ndarray[int64_t] out + Py_ssize_t i, l + + l = len(periodarr) + + out = np.empty(l, dtype='i8') + + for i in range(l): + out[i] = period_ordinal_to_dt64(periodarr[i], freq) + + return out + +cdef char START = 'S' +cdef char END = 'E' + +cpdef int64_t period_asfreq(int64_t period_ordinal, int freq1, int freq2, + bint end): + """ + Convert period ordinal from one frequency to another, and if upsampling, + choose to use start ('S') or end ('E') of period. + """ + cdef: + int64_t retval + + if end: + retval = asfreq(period_ordinal, freq1, freq2, END) + else: + retval = asfreq(period_ordinal, freq1, freq2, START) + + if retval == INT32_MIN: + raise ValueError('Frequency conversion failed') + + return retval + +def period_asfreq_arr(ndarray[int64_t] arr, int freq1, int freq2, bint end): + """ + Convert int64-array of period ordinals from one frequency to another, and + if upsampling, choose to use start ('S') or end ('E') of period. 
+ """ + cdef: + ndarray[int64_t] result + Py_ssize_t i, n + freq_conv_func func + asfreq_info finfo + int64_t val, ordinal + char relation + + n = len(arr) + result = np.empty(n, dtype=np.int64) + + func = get_asfreq_func(freq1, freq2) + get_asfreq_info(freq1, freq2, &finfo) + + if end: + relation = END + else: + relation = START + + for i in range(n): + val = func(arr[i], relation, &finfo) + if val == INT32_MIN: + raise ValueError("Unable to convert to desired frequency.") + result[i] = val + + return result + +def period_ordinal(int y, int m, int d, int h, int min, int s, int freq): + cdef: + int64_t ordinal + + return get_period_ordinal(y, m, d, h, min, s, freq) + + +cpdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq): + cdef: + pandas_datetimestruct dts + date_info dinfo + + get_date_info(ordinal, freq, &dinfo) + + dts.year = dinfo.year + dts.month = dinfo.month + dts.day = dinfo.day + dts.hour = dinfo.hour + dts.min = dinfo.minute + dts.sec = int(dinfo.second) + dts.us = dts.ps = 0 + + return pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + +def period_format(int64_t value, int freq, object fmt=None): + cdef: + int freq_group + + if fmt is None: + freq_group = (freq // 1000) * 1000 + if freq_group == 1000: # FR_ANN + fmt = b'%Y' + elif freq_group == 2000: # FR_QTR + fmt = b'%FQ%q' + elif freq_group == 3000: # FR_MTH + fmt = b'%b-%Y' + elif (freq_group == 4000 # WK + or freq_group == 5000 # BUS + or freq_group == 6000): # DAY + fmt = b'%d-%b-%Y' + elif freq_group == 7000: # HR + fmt = b'%d-%b-%Y %H:00' + elif freq_group == 8000: # MIN + fmt = b'%d-%b-%Y %H:%M' + elif freq_group == 9000: # SEC + fmt = b'%d-%b-%Y %H:%M:%S' + else: + raise ValueError('Unknown freq: %d' % freq) + + return _period_strftime(value, freq, fmt) + + +cdef list extra_fmts = [(b"%q", b"^`AB`^"), + (b"%f", b"^`CD`^"), + (b"%F", b"^`EF`^")] + +cdef list str_extra_fmts = ["^`AB`^", "^`CD`^", "^`EF`^"] + +cdef _period_strftime(int64_t value, int freq, object fmt): + cdef: + Py_ssize_t i + date_info dinfo + char *formatted + object pat, repl, result + list found_pat = [False] * len(extra_fmts) + int year, quarter + + if PyUnicode_Check(fmt): + fmt = fmt.encode('utf-8') + + get_date_info(value, freq, &dinfo) + for i in range(len(extra_fmts)): + pat = extra_fmts[i][0] + repl = extra_fmts[i][1] + if pat in fmt: + fmt = fmt.replace(pat, repl) + found_pat[i] = True + + formatted = c_strftime(&dinfo, fmt) + + result = util.char_to_string(formatted) + free(formatted) + + for i in range(len(extra_fmts)): + if found_pat[i]: + if get_yq(value, freq, &quarter, &year) < 0: + raise ValueError('Unable to get quarter and year') + + if i == 0: + repl = '%d' % quarter + elif i == 1: # %f, 2-digit year + repl = '%.2d' % (year % 100) + elif i == 2: + repl = '%d' % year + + result = result.replace(str_extra_fmts[i], repl) + + # Py3? 
+ if not PyString_Check(result): + result = str(result) + + return result + +# period accessors + +ctypedef int (*accessor)(int64_t ordinal, int freq) except INT32_MIN + +def get_period_field(int code, int64_t value, int freq): + cdef accessor f = _get_accessor_func(code) + return f(value, freq) + +def get_period_field_arr(int code, ndarray[int64_t] arr, int freq): + cdef: + Py_ssize_t i, sz + ndarray[int64_t] out + accessor f + + f = _get_accessor_func(code) + + sz = len(arr) + out = np.empty(sz, dtype=np.int64) + + for i in range(sz): + out[i] = f(arr[i], freq) + + return out + + + +cdef accessor _get_accessor_func(int code): + if code == 0: + return &pyear + elif code == 1: + return &pqyear + elif code == 2: + return &pquarter + elif code == 3: + return &pmonth + elif code == 4: + return &pday + elif code == 5: + return &phour + elif code == 6: + return &pminute + elif code == 7: + return &psecond + elif code == 8: + return &pweek + elif code == 9: + return &pday_of_year + elif code == 10: + return &pweekday + else: + raise ValueError('Unrecognized code: %s' % code) + diff --git a/pandas/src/properties.pyx b/pandas/src/properties.pyx new file mode 100644 index 00000000..30184979 --- /dev/null +++ b/pandas/src/properties.pyx @@ -0,0 +1,73 @@ +from cpython cimport PyDict_Contains, PyDict_GetItem, PyDict_GetItem + +cdef class cache_readonly(object): + + cdef readonly: + object fget, name + + def __init__(self, func): + self.fget = func + self.name = func.__name__ + + def __get__(self, obj, type): + if obj is None: + return self.fget + + # Get the cache or set a default one if needed + + cache = getattr(obj, '_cache', None) + if cache is None: + cache = obj._cache = {} + + if PyDict_Contains(cache, self.name): + # not necessary to Py_INCREF + val = PyDict_GetItem(cache, self.name) + return val + else: + val = self.fget(obj) + PyDict_SetItem(cache, self.name, val) + return val + +cdef class AxisProperty(object): + cdef: + Py_ssize_t axis + + def __init__(self, axis=0): + self.axis = axis + + def __get__(self, obj, type): + cdef list axes = obj._data.axes + return axes[self.axis] + + def __set__(self, obj, value): + obj._set_axis(self.axis, value) + +cdef class SeriesIndex(object): + cdef: + object _check_type + + def __init__(self): + from pandas.core.index import _ensure_index + self._check_type = _ensure_index + + def __get__(self, obj, type): + return obj._index + + def __set__(self, obj, value): + if len(obj) != len(value): + raise AssertionError('Index length did not match values') + obj._index = self._check_type(value) + +cdef class ValuesProperty(object): + + def __get__(self, obj, type): + cdef: + ndarray arr = obj + object base + + base = np.get_array_base(arr) + if base is None or not np.PyArray_CheckExact(base): + arr = arr.view(np.ndarray) + else: + arr = base + return arr diff --git a/pandas/src/reduce.pyx b/pandas/src/reduce.pyx new file mode 100644 index 00000000..367f3686 --- /dev/null +++ b/pandas/src/reduce.pyx @@ -0,0 +1,365 @@ +from numpy cimport * +import numpy as np + +cdef class Reducer: + ''' + Performs generic reduction operation on a C or Fortran-contiguous ndarray + while avoiding ndarray construction overhead + ''' + cdef: + Py_ssize_t increment, chunksize, nresults + object arr, dummy, f, labels + bint can_set_name + + def __init__(self, object arr, object f, axis=1, dummy=None, + labels=None): + n, k = arr.shape + + if axis == 0: + if not arr.flags.f_contiguous: + arr = arr.copy('F') + + self.nresults = k + self.chunksize = n + self.increment = n * 
arr.dtype.itemsize + else: + if not arr.flags.c_contiguous: + arr = arr.copy('C') + + self.nresults = n + self.chunksize = k + self.increment = k * arr.dtype.itemsize + + self.f = f + self.arr = arr + self.dummy = self._check_dummy(dummy) + self.labels = labels + + def _check_dummy(self, dummy=None): + if dummy is None: + dummy = np.empty(self.chunksize, dtype=self.arr.dtype) + self.can_set_name = 0 + else: + if dummy.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if len(dummy) != self.chunksize: + raise ValueError('Dummy array must be length %d' % + self.chunksize) + self.can_set_name = type(dummy) != np.ndarray + + return dummy + + def get_result(self): + cdef: + char* dummy_buf + ndarray arr, result, chunk + Py_ssize_t i + flatiter it + object res + bint set_label = 0 + ndarray labels + + arr = self.arr + chunk = self.dummy + + dummy_buf = chunk.data + chunk.data = arr.data + + set_label = self.labels is not None and self.can_set_name + if set_label: + labels = self.labels + + try: + for i in range(self.nresults): + if set_label: + chunk.name = util.get_value_at(labels, i) + + res = self.f(chunk) + if i == 0: + result = self._get_result_array(res) + it = PyArray_IterNew(result) + + PyArray_SETITEM(result, PyArray_ITER_DATA(it), res) + chunk.data = chunk.data + self.increment + PyArray_ITER_NEXT(it) + except Exception, e: + if hasattr(e, 'args'): + e.args = e.args + (i,) + raise + finally: + # so we don't free the wrong memory + chunk.data = dummy_buf + + if result.dtype == np.object_: + result = maybe_convert_objects(result) + + return result + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy))) + + result = np.empty(self.nresults, dtype='O') + result[0] = res + except Exception: + raise ValueError('function does not reduce') + return result + +cdef class SeriesBinGrouper: + ''' + Performs grouping operation according to bin edges, rather than labels + ''' + cdef: + Py_ssize_t nresults, ngroups + bint passed_dummy + + cdef public: + object arr, index, dummy, f, bins + + def __init__(self, object series, object f, object bins, object dummy): + n = len(series) + + self.bins = bins + self.f = f + if not series.flags.c_contiguous: + series = series.copy('C') + self.arr = series + self.index = series.index + + self.dummy = self._check_dummy(dummy) + self.passed_dummy = dummy is not None + self.ngroups = len(bins) + 1 + + def _check_dummy(self, dummy=None): + if dummy is None: + dummy = np.empty(0, dtype=self.arr.dtype) + else: + if dummy.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if not dummy.flags.contiguous: + dummy = dummy.copy() + + return dummy + + def get_result(self): + cdef: + ndarray arr, result + ndarray[int64_t] counts + Py_ssize_t i, n, group_size + object res, chunk + bint initialized = 0 + Slider vslider, islider + IndexEngine gin + + counts = np.zeros(self.ngroups, dtype=np.int64) + + if self.ngroups > 1: + counts[0] = self.bins[0] + for i in range(1, self.ngroups): + if i == self.ngroups - 1: + counts[i] = len(self.arr) - self.bins[i-1] + else: + counts[i] = self.bins[i] - self.bins[i-1] + + chunk = self.dummy + group_size = 0 + n = len(self.arr) + + vslider = Slider(self.arr, self.dummy) + islider = Slider(self.index, self.dummy.index) + + gin = self.dummy.index._engine + + try: + for i in range(self.ngroups): + group_size = counts[i] + + islider.set_length(group_size) + 
vslider.set_length(group_size) + + res = self.f(chunk) + + if not initialized: + result = self._get_result_array(res) + initialized = 1 + + util.assign_value_1d(result, i, res) + + islider.advance(group_size) + vslider.advance(group_size) + + gin.clear_mapping() + except: + raise + finally: + # so we don't free the wrong memory + islider.cleanup() + vslider.cleanup() + + if result.dtype == np.object_: + result = maybe_convert_objects(result) + + return result, counts + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy))) + + result = np.empty(self.ngroups, dtype='O') + except Exception: + raise ValueError('function does not reduce') + return result + +cdef class SeriesGrouper: + ''' + Performs generic grouping operation while avoiding ndarray construction + overhead + ''' + cdef: + Py_ssize_t nresults, ngroups + bint passed_dummy + + cdef public: + object arr, index, dummy, f, labels + + def __init__(self, object series, object f, object labels, + Py_ssize_t ngroups, object dummy): + n = len(series) + + self.labels = labels + self.f = f + if not series.flags.c_contiguous: + series = series.copy('C') + self.arr = series + self.index = series.index + + self.dummy = self._check_dummy(dummy) + self.passed_dummy = dummy is not None + self.ngroups = ngroups + + def _check_dummy(self, dummy=None): + if dummy is None: + dummy = np.empty(0, dtype=self.arr.dtype) + else: + if dummy.dtype != self.arr.dtype: + raise ValueError('Dummy array must be same dtype') + if not dummy.flags.contiguous: + dummy = dummy.copy() + + return dummy + + def get_result(self): + cdef: + ndarray arr, result + ndarray[int64_t] labels, counts + Py_ssize_t i, n, group_size, lab + object res, chunk + bint initialized = 0 + Slider vslider, islider + IndexEngine gin + + labels = self.labels + counts = np.zeros(self.ngroups, dtype=np.int64) + chunk = self.dummy + group_size = 0 + n = len(self.arr) + + vslider = Slider(self.arr, self.dummy) + islider = Slider(self.index, self.dummy.index) + + gin = self.dummy.index._engine + try: + for i in range(n): + group_size += 1 + + lab = labels[i] + + if i == n - 1 or lab != labels[i + 1]: + if lab == -1: + islider.advance(group_size) + vslider.advance(group_size) + group_size = 0 + continue + + islider.set_length(group_size) + vslider.set_length(group_size) + + res = self.f(chunk) + + if not initialized: + result = self._get_result_array(res) + initialized = 1 + + util.assign_value_1d(result, lab, res) + counts[lab] = group_size + islider.advance(group_size) + vslider.advance(group_size) + + group_size = 0 + + gin.clear_mapping() + + except: + raise + finally: + # so we don't free the wrong memory + islider.cleanup() + vslider.cleanup() + + if result.dtype == np.object_: + result = maybe_convert_objects(result) + + return result, counts + + def _get_result_array(self, object res): + try: + assert(not isinstance(res, np.ndarray)) + assert(not (isinstance(res, list) and len(res) == len(self.dummy))) + + result = np.empty(self.ngroups, dtype='O') + except Exception: + raise ValueError('function does not reduce') + return result + +cdef class Slider: + ''' + Only handles contiguous data for now + ''' + cdef: + ndarray values, buf + Py_ssize_t stride, orig_len + char *orig_data + + def __init__(self, object values, object buf): + assert(values.ndim == 1) + if not values.flags.contiguous: + values = values.copy() + + assert(values.dtype == buf.dtype) + self.values = values + self.buf = 
buf + self.stride = values.dtype.itemsize + + self.orig_data = self.buf.data + self.orig_len = self.buf.shape[0] + + self.buf.data = self.values.data + + cpdef advance(self, Py_ssize_t k): + self.buf.data = self.buf.data + self.stride * k + + cpdef set_length(self, Py_ssize_t length): + self.buf.shape[0] = length + + cpdef cleanup(self): + self.buf.shape[0] = self.orig_len + self.buf.data = self.orig_data + +def reduce(arr, f, axis=0, dummy=None, labels=None): + if labels._has_complex_internals: + raise Exception('Cannot use shortcut') + + reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) + return reducer.get_result() diff --git a/pandas/src/reindex.pyx b/pandas/src/reindex.pyx new file mode 100644 index 00000000..c255aa5c --- /dev/null +++ b/pandas/src/reindex.pyx @@ -0,0 +1,139 @@ +def ordered_left_join(ndarray[object] left, ndarray[object] right): + # cdef dict right_map = map_indices_buf(right) + # return merge_indexer(left, right_map) + cdef: + Py_ssize_t i, j, k, n + ndarray[int32_t] indexer + ndarray[uint8_t] mask + object val + + i = 0 + j = 0 + n = len(left) + k = len(right) + + indexer = np.zeros(n, dtype=np.int32) + mask = np.ones(n, dtype=np.uint8) + + for i from 0 <= i < n: + val = left[i] + + while j < k and right[j] < val: + j += 1 + + if j == k: + break + + if val == right[j]: + indexer[i] = j + mask[i] = 0 + + return indexer, mask.view(np.bool_) + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_2d(ndarray[int64_t] left, ndarray[int64_t] right, + ndarray[float64_t, ndim=2] lvalues, + ndarray[float64_t, ndim=2] rvalues, + ndarray[float64_t, ndim=2] out): + cdef: + Py_ssize_t i, j, k, nright, nleft, kright, kleft + int64_t val + + nleft, kleft = ( lvalues).shape + nright, kright = ( rvalues).shape + + j = 0 + for i from 0 <= i < nleft: + for k from 0 <= k < kleft: + out[i, k] = lvalues[i, k] + + val = left[i] + + while j < nright and right[j] < val: + j += 1 + + if j == nright: + for k from kleft <= k < kleft + kright: + out[i, k] = NaN + continue + + if val == right[j]: + for k from kleft <= k < kleft + kright: + out[i, k] = rvalues[j, k - kleft] + else: + for k from kleft <= k < kleft + kright: + out[i, k] = NaN + +@cython.wraparound(False) +@cython.boundscheck(False) +def left_join_1d(ndarray[int64_t] left, ndarray[int64_t] right, + ndarray[float64_t] lvalues, + ndarray[float64_t] rvalues, + ndarray[float64_t, ndim=2] out): + cdef: + Py_ssize_t i, j, nright, nleft + int64_t val + + nleft = len(lvalues) + nright = len(rvalues) + + j = 0 + for i from 0 <= i < nleft: + out[i, 0] = lvalues[i] + + val = left[i] + + while j < nright and right[j] < val: + j += 1 + + if j == nright: + out[i, 1] = NaN + continue + + if val == right[j]: + out[i, 1] = rvalues[j] + else: + out[i, 1] = NaN + + +@cython.wraparound(False) +@cython.boundscheck(False) +def take_join_contiguous(ndarray[float64_t, ndim=2] lvalues, + ndarray[float64_t, ndim=2] rvalues, + ndarray[int32_t] lindexer, + ndarray[int32_t] rindexer, + ndarray out): + cdef: + Py_ssize_t i, j, rk, lk, n, lidx, ridx + float64_t *outbuf + + assert(out.flags.contiguous) + + outbuf = out.data + + n = len(lindexer) + lk = lvalues.shape[1] + rk = rvalues.shape[1] + + for i from 0 <= i < n: + lidx = lindexer[i] + ridx = rindexer[i] + + if lidx == -1: + for j from 0 <= j < lk: + outbuf[0] = NaN + outbuf = outbuf + 1 + else: + for j from 0 <= j < lk: + outbuf[0] = lvalues[lidx, j] + outbuf = outbuf + 1 + + if lidx == -1: + for j from 0 <= j < rk: + outbuf[0] = NaN + outbuf = outbuf + 1 + else: + for j from 
0 <= j < rk: + outbuf[0] = rvalues[ridx, j] + outbuf = outbuf + 1 diff --git a/pandas/src/sandbox.pyx b/pandas/src/sandbox.pyx new file mode 100644 index 00000000..dabeb7cf --- /dev/null +++ b/pandas/src/sandbox.pyx @@ -0,0 +1,500 @@ +# cython: wraparound=False +# cython: boundscheck=False + +from numpy cimport * +cimport numpy as cnp +import numpy as np + +from cpython cimport * +cimport cpython + +cnp.import_array() + +cdef class SeriesIterator: + + def __init__(self, arr): + pass + + def next(self): + pass + +def foo(object o): + cdef int64_t bar = o + return bar + +def foo2(): + print sizeof(PyObject*) + +def bench_dict(): + cdef: + # Py_ssize_t i + dict d = {} + + for i in range(1000000): + d[i] = i + +from cpython cimport PyObject + +# cdef extern from "numpy/arrayobject.h": +# bint PyArray_Check(PyObject*) + +cimport cython + +@cython.boundscheck(False) +@cython.wraparound(False) +def bench_typecheck1(ndarray[object] arr): + cdef Py_ssize_t i, n + n = cnp.PyArray_SIZE(arr) + for i in range(n): + cpython.PyFloat_Check(arr[i]) + +# def bench_typecheck2(ndarray[object] arr): +# cdef Py_ssize_t i, n +# cdef PyObject** buf = arr.data +# n = cnp.PyArray_SIZE(arr) +# for i in range(n): +# PyArray_Check(buf[i]) + + + +from skiplist cimport * + +def sl_test(): + cdef int ret + + np.random.seed(12345) + n = 100 + + cdef skiplist_t* skp = skiplist_init(n) + + arr = np.random.randn(n) + + for i in range(n): + print i + skiplist_insert(skp, arr[i]) + # val = skiplist_get(skp, 0, &ret) + # if ret == 0: + # raise ValueError('%d out of bounds' % i) + + if i >= 20: + skiplist_remove(skp, arr[i-20]) + + # skiplist_remove(skp, arr[i]) + # print 'Skiplist begin: %s' % skiplist_get(skp, 0) + # print 'Actual begin: %s' % sorted(arr[:i+1])[0] + data = arr[max(i-19, 0):i+1] + print 'Skiplist middle: %s' % skiplist_get(skp, len(data) // 2, &ret) + print 'Actual middle: %s' % sorted(data)[len(data) // 2] + + skiplist_destroy(skp) + +cdef double NaN = np.NaN + +def _check_minp(minp, N): + if minp > N: + minp = N + 1 + elif minp == 0: + minp = 1 + elif minp < 0: + raise ValueError('min_periods must be >= 0') + return minp + +cdef extern from "Python.h": + bint PyDict_Contains(object, PyObject*) + PyObject* PyDict_GetItem(object, PyObject*) + long PyInt_AS_LONG(PyObject*) + +def get_indexer(ndarray values, dict mapping): + cdef: + Py_ssize_t i, length + ndarray fill_vec + PyObject **buf + int32_t *resbuf + PyObject* val + + length = len(values) + buf = values.data + fill_vec = np.empty(length, dtype='i4') + resbuf = fill_vec.data + + for i in range(length): + val = buf[i] + if PyDict_Contains(mapping, val): + resbuf[i] = PyInt_AS_LONG(PyDict_GetItem(mapping, val)) + else: + resbuf[i] = -1 + return fill_vec + + +@cython.wraparound(False) +@cython.boundscheck(False) +def fancy_inc(ndarray[int64_t, ndim=2] values, + ndarray[int64_t] iarr, ndarray[int64_t] jarr, int64_t inc): + cdef: + Py_ssize_t i, n = len(iarr) + + for i in range(n): + values[iarr[i], jarr[i]] += inc + + + +# def foo2(o): +# return util.is_integer_object(o) + +# def foo3(o): +# return util.get_base_ndarray(o) + + +cimport util + +from khash cimport * + +cdef class Int64HashTable: + + cdef: + kh_int64_t *table + + def __init__(self, size_hint=1): + if size_hint is not None: + kh_resize_int64(self.table, size_hint) + + def __cinit__(self): + self.table = kh_init_int64() + + def __dealloc__(self): + kh_destroy_int64(self.table) + + @cython.boundscheck(False) + @cython.wraparound(False) + def get_labels(self, ndarray[int64_t] values): + cdef: + 
Py_ssize_t i, n = len(values) + ndarray[int32_t] labels + Py_ssize_t idx, count = 0 + int ret = 0 + int64_t val + khiter_t k + + labels = np.empty(n, dtype=np.int32) + + for i in range(n): + val = values[i] + k = kh_get_int64(self.table, val) + if k != self.table.n_buckets: + idx = self.table.vals[k] + labels[i] = idx + else: + k = kh_put_int64(self.table, val, &ret) + self.table.vals[k] = count + labels[i] = count + count += 1 + + return labels + +#---------------------------------------------------------------------- +# isnull / notnull related + +cdef double INF = np.inf +cdef double NEGINF = -INF + +cdef inline bint _checknull(object val): + return not np.PyArray_Check(val) and (val is None or val != val) + +cdef inline bint _checknan(object val): + return not np.PyArray_Check(val) and val != val + +cpdef checknull(object val): + if util.is_float_object(val): + return val != val or val == INF or val == NEGINF + elif is_array(val): + return False + else: + return _checknull(val) + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj(ndarray[object] arr): + cdef Py_ssize_t i, n + cdef object val + cdef ndarray[uint8_t] result + + n = len(arr) + result = np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + result[i] = _checknull(arr[i]) + return result.view(np.bool_) + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj2d(ndarray[object, ndim=2] arr): + cdef Py_ssize_t i, j, n, m + cdef object val + cdef ndarray[uint8_t, ndim=2] result + + n, m = ( arr).shape + result = np.zeros((n, m), dtype=np.uint8) + for i from 0 <= i < n: + for j from 0 <= j < m: + val = arr[i, j] + if checknull(val): + result[i, j] = 1 + return result.view(np.bool_) + +from util cimport is_array + +from numpy import nan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +cdef float64_t FP_ERR = 1e-13 + +cimport util + +cdef: + int TIEBREAK_AVERAGE = 0 + int TIEBREAK_MIN = 1 + int TIEBREAK_MAX = 2 + int TIEBREAK_FIRST = 3 + +tiebreakers = { + 'average' : TIEBREAK_AVERAGE, + 'min' : TIEBREAK_MIN, + 'max' : TIEBREAK_MAX, + 'first' : TIEBREAK_FIRST +} + +from khash cimport * + +def test(ndarray arr, Py_ssize_t size_hint): + cdef: + kh_pymap_t *table + int ret = 0 + khiter_t k + PyObject **data + Py_ssize_t i, n + ndarray[Py_ssize_t] indexer + + table = kh_init_pymap() + kh_resize_pymap(table, size_hint) + + data = arr.data + n = len(arr) + + indexer = np.empty(n, dtype=np.int_) + + for i in range(n): + k = kh_put_pymap(table, data[i], &ret) + + # if not ret: + # kh_del_pymap(table, k) + + table.vals[k] = i + + for i in range(n): + k = kh_get_pymap(table, data[i]) + indexer[i] = table.vals[k] + + kh_destroy_pymap(table) + + return indexer + + +def test_str(ndarray arr, Py_ssize_t size_hint): + cdef: + kh_str_t *table + kh_cstr_t val + int ret = 0 + khiter_t k + PyObject **data + Py_ssize_t i, n + ndarray[Py_ssize_t] indexer + + table = kh_init_str() + kh_resize_str(table, size_hint) + + data = arr.data + n = len(arr) + + indexer = np.empty(n, dtype=np.int_) + + for i in range(n): + k = kh_put_str(table, util.get_c_string( data[i]), &ret) + + # if not ret: + # kh_del_str(table, k) + + table.vals[k] = i + + # for i in range(n): + # k = kh_get_str(table, PyString_AsString( data[i])) + # indexer[i] = table.vals[k] + + kh_destroy_str(table) + + return indexer + +# def test2(ndarray[object] arr): +# cdef: +# dict table +# object obj +# Py_ssize_t i, loc, n +# ndarray[Py_ssize_t] indexer + +# n = len(arr) +# indexer = np.empty(n, dtype=np.int_) + +# table = {} +# 
for i in range(n): +# table[arr[i]] = i + +# for i in range(n): +# indexer[i] = table[arr[i]] + +# return indexer + +def obj_unique(ndarray[object] arr): + cdef: + kh_pyset_t *table + # PyObject *obj + object obj + PyObject **data + int ret = 0 + khiter_t k + Py_ssize_t i, n + list uniques + + n = len(arr) + uniques = [] + + table = kh_init_pyset() + + data = arr.data + + # size hint + kh_resize_pyset(table, n // 10) + + for i in range(n): + obj = arr[i] + + k = kh_get_pyset(table, obj) + if not kh_exist_pyset(table, k): + k = kh_put_pyset(table, obj, &ret) + # uniques.append(obj) + # Py_INCREF( obj) + + kh_destroy_pyset(table) + + return None + +def int64_unique(ndarray[int64_t] arr): + cdef: + kh_int64_t *table + # PyObject *obj + int64_t obj + PyObject **data + int ret = 0 + khiter_t k + Py_ssize_t i, j, n + ndarray[int64_t] uniques + + n = len(arr) + uniques = np.empty(n, dtype='i8') + + table = kh_init_int64() + kh_resize_int64(table, n) + + j = 0 + + for i in range(n): + obj = arr[i] + + k = kh_get_int64(table, obj) + if not kh_exist_int64(table, k): + k = kh_put_int64(table, obj, &ret) + uniques[j] = obj + j += 1 + # Py_INCREF( obj) + + kh_destroy_int64(table) + + return np.sort(uniques[:j]) + + +# cdef extern from "kvec.h": + +# ctypedef struct kv_int64_t: +# size_t n, m +# int64_t *a + + +def test_foo(ndarray[int64_t] values): + cdef int64_t val + + val = values[0] + print val + +# cdef extern from "foo.h": +# double add_things(double *a, double *b, double *c, int n) + + +# def cython_test(ndarray a, ndarray b, ndarray c): +# return add_things( a.data, +# b.data, +# c.data, len(a)) + + +# def cython_test2(ndarray[float64_t] a, ndarray[float64_t] b, +# ndarray[float64_t] c): +# cdef: +# Py_ssize_t i, n = len(a) +# float64_t result = 0 + +# for i in range(n): +# result += a[i] + b[i] + c[i] + +# return result + +@cython.boundscheck(False) +@cython.wraparound(False) +def inner(ndarray[float64_t] x, ndarray[float64_t] y): + cdef Py_ssize_t i, n = len(x) + cdef float64_t result = 0 + for i in range(n): + result += x[i] * y[i] + return result + +def indices_fast(ndarray[int64_t] labels, list keys, + list sorted_labels): + cdef: + Py_ssize_t i, j, k, lab, cur, start, n = len(labels) + dict result = {} + object tup + + index = np.arange(n) + + k = len(keys) + + if n == 0: + return result + + start = 0 + cur = labels[0] + for i in range(1, n): + lab = labels[i] + + if lab != cur: + if lab != -1: + tup = PyTuple_New(k) + for j in range(k): + val = util.get_value_at(keys[j], + sorted_labels[j][cur]) + PyTuple_SET_ITEM(tup, j, val) + Py_INCREF(val) + + result[tup] = index[start:i] + start = i + cur = lab + + return result diff --git a/pandas/src/skiplist.h b/pandas/src/skiplist.h new file mode 100644 index 00000000..57b32005 --- /dev/null +++ b/pandas/src/skiplist.h @@ -0,0 +1,281 @@ + +/* + Flexibly-sized, indexable skiplist data structure for maintaining a sorted + list of values + + Port of Wes McKinney's Cython version of Raymond Hettinger's original pure + Python recipe (http://rhettinger.wordpress.com/2010/02/06/lost-knowledge/) + */ + +// #include +// #include + + +#include +#include +#include +#include + +#ifndef PANDAS_INLINE + #if defined(__GNUC__) + #define PANDAS_INLINE __inline__ + #elif defined(_MSC_VER) + #define PANDAS_INLINE __inline + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + #define PANDAS_INLINE inline + #else + #define PANDAS_INLINE + #endif +#endif + +PANDAS_INLINE static float __skiplist_nanf(void) +{ + const union { int __i; float __f;} __bint = 
{0x7fc00000UL}; + return __bint.__f; +} +#define PANDAS_NAN ((double) __skiplist_nanf()) + + +static PANDAS_INLINE double Log2(double val) { + return log(val) / log(2.); +} + +typedef struct node_t node_t; + +struct node_t { + double value; + int is_nil; + int levels; + node_t **next; + int *width; + int ref_count; +}; + +typedef struct { + node_t *head; + int size, maxlevels; + node_t **tmp_chain; + int *tmp_steps; +} skiplist_t; + +static PANDAS_INLINE double urand(void) { + return rand() / ((double) RAND_MAX + 1); +} + +static PANDAS_INLINE int int_min(int a, int b) { + return a < b ? a : b; +} + +static PANDAS_INLINE node_t *node_init(double value, int levels) { + node_t *result; + result = (node_t*) calloc(1, sizeof(node_t)); + + result->value = value; + result->levels = levels; + result->is_nil = 0; + result->ref_count = 0; + + result->next = (node_t**) malloc(levels * sizeof(node_t*)); + result->width = (int*) malloc(levels * sizeof(int)); + + return result; +} + +// do this ourselves + +static PANDAS_INLINE void node_incref(node_t *node) { + node->ref_count += 1; +} + +static PANDAS_INLINE void node_decref(node_t *node) { + node->ref_count -= 1; +} + +static void node_destroy(node_t *node) { + int i; + if (node) { + if (node->ref_count == 1) { + for (i = 0; i < node->levels; ++i) { + node_destroy(node->next[i]); + } + free(node->next); + free(node->width); + // printf("Reference count was 1, freeing\n"); + free(node); + } + else { + node_decref(node); + } + // pretty sure that freeing the struct above will be enough + } +} + +static PANDAS_INLINE skiplist_t *skiplist_init(int expected_size) { + skiplist_t *result; + node_t *NIL, *head; + int maxlevels, i; + + maxlevels = Log2((double) expected_size); + result = (skiplist_t*) calloc(1, sizeof(skiplist_t)); + result->tmp_chain = (node_t**) malloc(maxlevels * sizeof(node_t*)); + result->tmp_steps = (int*) malloc(maxlevels * sizeof(int)); + result->maxlevels = maxlevels; + + head = result->head = node_init(PANDAS_NAN, maxlevels); + node_incref(head); + + NIL = node_init(0, 0); + NIL->is_nil = 1; + + for (i = 0; i < maxlevels; ++i) + { + head->next[i] = NIL; + head->width[i] = 1; + node_incref(NIL); + } + + return result; +} + +static PANDAS_INLINE void skiplist_destroy(skiplist_t *skp) { + if (skp) { + node_destroy(skp->head); + free(skp->tmp_steps); + free(skp->tmp_chain); + free(skp); + } +} + + +// 1 if left < right, 0 if left == right, -1 if left > right + +static PANDAS_INLINE int _node_cmp(node_t* node, double value){ + if (node->is_nil || node->value > value) { + return -1; + } + else if (node->value < value) { + return 1; + } + else { + return 0; + } +} + +static PANDAS_INLINE double skiplist_get(skiplist_t *skp, int i, int *ret) { + node_t *node; + int level; + + if (i < 0 || i >= skp->size) { + *ret = 0; + return 0; + } + + node = skp->head; + i++; + for (level = skp->maxlevels - 1; level >= 0; --level) + { + while (node->width[level] <= i) + { + i = i - node->width[level]; + node = node->next[level]; + } + } + + *ret = 1; + return node->value; +} + +static PANDAS_INLINE int skiplist_insert(skiplist_t *skp, double value) { + node_t *node, *prevnode, *newnode, *next_at_level; + int *steps_at_level; + int size, steps, level; + node_t **chain; + + chain = skp->tmp_chain; + + steps_at_level = skp->tmp_steps; + memset(steps_at_level, 0, skp->maxlevels * sizeof(int)); + + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) + { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) >= 
0) { + steps_at_level[level] += node->width[level]; + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; + } + + size = int_min(skp->maxlevels, 1 - ((int) Log2(urand()))); + + newnode = node_init(value, size); + steps = 0; + + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + newnode->next[level] = prevnode->next[level]; + + prevnode->next[level] = newnode; + node_incref(newnode); // increment the reference count + + newnode->width[level] = prevnode->width[level] - steps; + prevnode->width[level] = steps + 1; + + steps += steps_at_level[level]; + } + + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] += 1; + } + + skp->size++; + + return 1; +} + +static PANDAS_INLINE int skiplist_remove(skiplist_t *skp, double value) { + int level, size; + node_t *node, *prevnode, *tmpnode, *next_at_level; + node_t **chain; + + chain = skp->tmp_chain; + node = skp->head; + + for (level = skp->maxlevels - 1; level >= 0; --level) + { + next_at_level = node->next[level]; + while (_node_cmp(next_at_level, value) > 0) { + node = next_at_level; + next_at_level = node->next[level]; + } + chain[level] = node; + } + + if (value != chain[0]->next[0]->value) { + return 0; + } + + size = chain[0]->next[0]->levels; + + for (level = 0; level < size; ++level) { + prevnode = chain[level]; + + tmpnode = prevnode->next[level]; + + prevnode->width[level] += tmpnode->width[level] - 1; + prevnode->next[level] = tmpnode->next[level]; + + tmpnode->next[level] = NULL; + node_destroy(tmpnode); // decrement refcount or free + } + + for (level = size; level < skp->maxlevels; ++level) { + chain[level]->width[level] -= 1; + } + + skp->size--; + return 1; +} diff --git a/pandas/src/skiplist.pxd b/pandas/src/skiplist.pxd new file mode 100644 index 00000000..c1221c47 --- /dev/null +++ b/pandas/src/skiplist.pxd @@ -0,0 +1,21 @@ +cdef extern from "skiplist.h": + ctypedef struct node_t: + double value + int is_nil + int levels + node_t **next + int *width + int ref_count + + ctypedef struct skiplist_t: + node_t *head + int size, maxlevels + node_t **tmp_chain + int *tmp_steps + + inline skiplist_t* skiplist_init(int) + inline void skiplist_destroy(skiplist_t*) + inline double skiplist_get(skiplist_t*, int, int*) + inline int skiplist_insert(skiplist_t*, double) + inline int skiplist_remove(skiplist_t*, double) + diff --git a/pandas/src/skiplist.pyx b/pandas/src/skiplist.pyx new file mode 100644 index 00000000..4e00fd27 --- /dev/null +++ b/pandas/src/skiplist.pyx @@ -0,0 +1,153 @@ +# Cython version of IndexableSkiplist, for implementing moving median +# with O(log n) updates +# Original author: Raymond Hettinger +# Original license: MIT +# Link: http://code.activestate.com/recipes/576930/ + +# Cython version: Wes McKinney + +cdef extern from "numpy/arrayobject.h": + + void import_array() + +cdef extern from "math.h": + double log(double x) + +# MSVC does not have log2! + +cdef double Log2(double x): + return log(x) / log(2.) 
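+
+# Worked example (illustrative): Log2(8.0) == 3.0. insert() below draws the
+# number of levels for a new node as min(maxlevels, 1 - int(Log2(random()))),
+# a roughly geometric distribution, which is what keeps lookups O(log n).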
+ +cimport numpy as np +from numpy cimport * +import numpy as np + +from random import random + +# initialize numpy +import_array() + +# TODO: optimize this, make less messy + +cdef class Node: + cdef public: + double_t value + list next + list width + + def __init__(self, double_t value, list next, list width): + self.value = value + self.next = next + self.width = width + +# Singleton terminator node +NIL = Node(np.inf, [], []) + +cdef class IndexableSkiplist: + ''' + Sorted collection supporting O(lg n) insertion, removal, and + lookup by rank. + ''' + cdef: + Py_ssize_t size, maxlevels + Node head + + def __init__(self, expected_size=100): + self.size = 0 + self.maxlevels = int(1 + Log2(expected_size)) + self.head = Node(np.NaN, [NIL] * self.maxlevels, [1] * self.maxlevels) + + def __len__(self): + return self.size + + def __getitem__(self, i): + return self.get(i) + + cpdef get(self, Py_ssize_t i): + cdef Py_ssize_t level + cdef Node node + + node = self.head + i += 1 + + for level in range(self.maxlevels - 1, -1, -1): + while node.width[level] <= i: + i -= node.width[level] + node = node.next[level] + + + return node.value + + cpdef insert(self, double value): + cdef Py_ssize_t level, steps, d + cdef Node node, prevnode, newnode, next_at_level, tmp + cdef list chain, steps_at_level + + # find first node on each level where node.next[levels].value > value + chain = [None] * self.maxlevels + steps_at_level = [0] * self.maxlevels + node = self.head + + for level in range(self.maxlevels - 1, -1, -1): + next_at_level = node.next[level] + + while next_at_level.value <= value: + steps_at_level[level] = (steps_at_level[level] + + node.width[level]) + node = next_at_level + next_at_level = node.next[level] + + chain[level] = node + + # insert a link to the newnode at each level + d = min(self.maxlevels, 1 - int(Log2(random()))) + newnode = Node(value, [None] * d, [None] * d) + steps = 0 + + for level in range(d): + prevnode = chain[level] + newnode.next[level] = prevnode.next[level] + prevnode.next[level] = newnode + newnode.width[level] = (prevnode.width[level] - steps) + prevnode.width[level] = steps + 1 + steps += steps_at_level[level] + + for level in range(d, self.maxlevels): + ( chain[level]).width[level] += 1 + + self.size += 1 + + cpdef remove(self, double value): + cdef Py_ssize_t level, d + cdef Node node, prevnode, tmpnode, next_at_level + cdef list chain + + # find first node on each level where node.next[levels].value >= value + chain = [None] * self.maxlevels + node = self.head + + for level in range(self.maxlevels - 1, -1, -1): + next_at_level = node.next[level] + while next_at_level.value < value: + node = next_at_level + next_at_level = node.next[level] + + chain[level] = node + + if value != ( ( ( chain[0]).next)[0]).value: + raise KeyError('Not Found') + + # remove one link at each level + d = len(( ( ( chain[0]).next)[0]).next) + + for level in range(d): + prevnode = chain[level] + tmpnode = prevnode.next[level] + prevnode.width[level] += tmpnode.width[level] - 1 + prevnode.next[level] = tmpnode.next[level] + + for level in range(d, self.maxlevels): + tmpnode = chain[level] + tmpnode.width[level] -= 1 + + self.size -= 1 diff --git a/pandas/src/sparse.pyx b/pandas/src/sparse.pyx new file mode 100644 index 00000000..19ff2df2 --- /dev/null +++ b/pandas/src/sparse.pyx @@ -0,0 +1,1186 @@ +from numpy cimport ndarray, int32_t, float64_t +cimport numpy as np + +cimport cython + +import numpy as np +import operator +import sys + +np.import_array() +np.import_ufunc() + 
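+# Illustrative usage sketch (names refer to the classes defined below; the
+# values are made up):
+#
+#   idx = IntIndex(10, [2, 5, 7])    # length-10 vector with 3 stored points
+#   blk = idx.to_block_index()       # same index in block (run-length) form
+#   idx.intersect(blk)               # -> IntIndex of positions present in both
+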
+#------------------------------------------------------------------------------- +# Preamble stuff + +cdef float64_t NaN = np.NaN +cdef float64_t INF = np.inf + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +#------------------------------------------------------------------------------- + + +cdef class SparseIndex: + ''' + Abstract superclass for sparse index types + ''' + def __init__(self): + raise NotImplementedError + + +cdef class IntIndex(SparseIndex): + ''' + Object for holding exact integer sparse indexing information + + Parameters + ---------- + length : integer + indices : array-like + Contains integers corresponding to + ''' + cdef readonly: + Py_ssize_t length, npoints + ndarray indices + + def __init__(self, Py_ssize_t length, indices): + self.length = length + self.indices = np.ascontiguousarray(indices, dtype=np.int32) + self.npoints = len(self.indices) + + def __reduce__(self): + args = (self.length, self.indices) + return (IntIndex, args) + + def __repr__(self): + output = 'IntIndex\n' + output += 'Indices: %s\n' % repr(self.indices) + return output + + def check_integrity(self): + ''' + Only need be strictly ascending and nothing less than 0 or greater than + totall ength + ''' + pass + + def equals(self, other): + if not isinstance(other, IntIndex): + return False + + if self is other: + return True + + same_length = self.length == other.length + same_indices = np.array_equal(self.indices, other.indices) + return same_length and same_indices + + @property + def ngaps(self): + return self.length - self.npoints + + def to_int_index(self): + return self + + def to_block_index(self): + locs, lens = get_blocks(self.indices) + return BlockIndex(self.length, locs, lens) + + cpdef IntIndex intersect(self, SparseIndex y_): + cdef: + Py_ssize_t out_length, xi, yi = 0 + int32_t xind + ndarray[int32_t, ndim=1] xindices, yindices + list new_list = [] + IntIndex y + + # if is one already, returns self + y = y_.to_int_index() + + if self.length != y.length: + raise Exception('Indices must reference same underlying length') + + xindices = self.indices + yindices = y.indices + + for xi from 0 <= xi < self.npoints: + xind = xindices[xi] + + while yi < y.npoints and yindices[yi] < xind: + yi += 1 + + if yi >= y.npoints: + break + + # TODO: would a two-pass algorithm be faster? 
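+            # sorted-merge step: yi was advanced above until yindices[yi] >= xind,
+            # so a position is kept only when the two indices coincide exactly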
+ if yindices[yi] == xind: + new_list.append(xind) + + return IntIndex(self.length, new_list) + + cpdef IntIndex make_union(self, SparseIndex y_): + cdef: + Py_ssize_t out_length, i, xi, yi + int32_t xind + ndarray[int32_t, ndim=1] xindices, yindices + list new_list = [] + IntIndex x, y + + x = self + + # if is one already, returns self + y = y_.to_int_index() + + if self.length != y.length: + raise Exception('Indices must reference same underlying length') + + xindices = self.indices + yindices = y.indices + + xi = yi = 0 + while True: + if xi == x.npoints: + while yi < y.npoints: + new_list.append(yindices[yi]) + yi += 1 + break + elif yi == y.npoints: + while xi < x.npoints: + new_list.append(xindices[xi]) + xi += 1 + break + + xind = xindices[xi] + yind = yindices[yi] + + if xind == yind: + new_list.append(xind) + xi += 1 + yi += 1 + elif xind < yind: + new_list.append(xind) + xi += 1 + else: + new_list.append(yind) + yi += 1 + + return IntIndex(x.length, new_list) + + @cython.wraparound(False) + cpdef lookup(self, Py_ssize_t index): + cdef: + Py_ssize_t res, n, cum_len = 0 + ndarray[int32_t, ndim=1] inds + + inds = self.indices + res = inds.searchsorted(index) + if res == self.npoints: + return -1 + elif inds[res] == index: + return res + else: + return -1 + + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, + float64_t fill_value, SparseIndex other_): + cdef: + Py_ssize_t i = 0, j = 0 + IntIndex other + ndarray[float64_t, ndim=1] result + ndarray[int32_t, ndim=1] sinds, oinds + + other = other_.to_int_index() + + oinds = other.indices + sinds = self.indices + + result = np.empty(other.npoints, dtype=np.float64) + result.fill(fill_value) + + for 0 <= i < other.npoints: + while oinds[i] > sinds[j] and j < self.npoints: + j += 1 + + if j == self.npoints: + break + + if oinds[i] < sinds[j]: + continue + elif oinds[i] == sinds[j]: + result[i] = values[j] + j += 1 + + return result + + cpdef put(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices, object to_put): + pass + + cpdef take(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices): + pass + +cpdef get_blocks(ndarray[int32_t, ndim=1] indices): + cdef: + Py_ssize_t i, npoints + int32_t block, length = 1, cur, prev + list locs = [], lens = [] + + npoints = len(indices) + + # just handle the special empty case separately + if npoints == 0: + return [], [] + + # TODO: two-pass algorithm faster? 
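+    # single pass: consecutive indices extend the current run; a gap larger
+    # than 1 closes the run as a (block start, block length) pair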
+ prev = block = indices[0] + for i from 1 <= i < npoints: + cur = indices[i] + if cur - prev > 1: + # new block + locs.append(block) + lens.append(length) + block = cur + length = 1 + else: + # same block, increment length + length += 1 + + prev = cur + + locs.append(block) + lens.append(length) + return locs, lens + +#------------------------------------------------------------------------------- +# BlockIndex + +cdef class BlockIndex(SparseIndex): + ''' + Object for holding block-based sparse indexing information + + Parameters + ---------- + ''' + cdef readonly: + Py_ssize_t nblocks, npoints, length + ndarray blocs, blengths + + cdef: + object __weakref__ # need to be picklable + int32_t* locbuf, *lenbuf + + def __init__(self, length, blocs, blengths): + + self.blocs = np.ascontiguousarray(blocs, dtype=np.int32) + self.blengths = np.ascontiguousarray(blengths, dtype=np.int32) + + # in case we need + self.locbuf = self.blocs.data + self.lenbuf = self.blengths.data + + self.length = length + self.nblocks = len(self.blocs) + self.npoints = self.blengths.sum() + + # self.block_start = blocs + # self.block_end = blocs + blengths + + self.check_integrity() + + def __reduce__(self): + args = (self.length, self.blocs, self.blengths) + return (BlockIndex, args) + + def __repr__(self): + output = 'BlockIndex\n' + output += 'Block locations: %s\n' % repr(self.blocs) + output += 'Block lengths: %s' % repr(self.blengths) + + return output + + @property + def ngaps(self): + return self.length - self.npoints + + cpdef check_integrity(self): + ''' + Check: + - Locations are in ascending order + - No overlapping blocks + - Blocks to not start after end of index, nor extend beyond end + ''' + cdef: + Py_ssize_t i + ndarray[int32_t, ndim=1] blocs, blengths + + blocs = self.blocs + blengths = self.blengths + + if len(blocs) != len(blengths): + raise ValueError('block bound arrays must be same length') + + for i from 0 <= i < self.nblocks: + if i > 0: + if blocs[i] <= blocs[i-1]: + raise ValueError('Locations not in ascending order') + + if i < self.nblocks - 1: + if blocs[i] + blengths[i] > blocs[i + 1]: + raise ValueError('Block %d overlaps' % i) + else: + if blocs[i] + blengths[i] > self.length: + raise ValueError('Block %d extends beyond end' % i) + + # no zero-length blocks + if blengths[i] == 0: + raise ValueError('Zero-length block %d' % i) + + def equals(self, other): + if not isinstance(other, BlockIndex): + return False + + if self is other: + return True + + same_length = self.length == other.length + same_blocks = (np.array_equal(self.blocs, other.blocs) and + np.array_equal(self.blengths, other.blengths)) + return same_length and same_blocks + + def to_block_index(self): + return self + + def to_int_index(self): + cdef: + Py_ssize_t i = 0, j, b + int32_t offset + ndarray[int32_t, ndim=1] indices + + indices = np.empty(self.npoints, dtype=np.int32) + + for b from 0 <= b < self.nblocks: + offset = self.locbuf[b] + + for j from 0 <= j < self.lenbuf[b]: + indices[i] = offset + j + i += 1 + + return IntIndex(self.length, indices) + + cpdef BlockIndex intersect(self, SparseIndex other): + ''' + Intersect two BlockIndex objects + + Parameters + ---------- + + Returns + ------- + intersection : BlockIndex + ''' + cdef: + BlockIndex y + ndarray[int32_t, ndim=1] xloc, xlen, yloc, ylen + + list out_blocs = [] + list out_blengths = [] + + Py_ssize_t xi = 0, yi = 0 + int32_t cur_loc, cur_length, diff + + y = other.to_block_index() + + if self.length != y.length: + raise Exception('Indices must reference 
same underlying length') + + xloc = self.blocs + xlen = self.blengths + yloc = y.blocs + ylen = y.blengths + + while True: + # we are done (or possibly never began) + if xi >= self.nblocks or yi >= y.nblocks: + break + + # completely symmetric...would like to avoid code dup but oh well + if xloc[xi] >= yloc[yi]: + cur_loc = xloc[xi] + diff = xloc[xi] - yloc[yi] + + if ylen[yi] <= diff: + # have to skip this block + yi += 1 + continue + + if ylen[yi] - diff < xlen[xi]: + # take end of y block, move onward + cur_length = ylen[yi] - diff + yi += 1 + else: + # take end of x block + cur_length = xlen[xi] + xi += 1 + + else: # xloc[xi] < yloc[yi] + cur_loc = yloc[yi] + diff = yloc[yi] - xloc[xi] + + if xlen[xi] <= diff: + # have to skip this block + xi += 1 + continue + + if xlen[xi] - diff < ylen[yi]: + # take end of x block, move onward + cur_length = xlen[xi] - diff + xi += 1 + else: + # take end of y block + cur_length = ylen[yi] + yi += 1 + + out_blocs.append(cur_loc) + out_blengths.append(cur_length) + + return BlockIndex(self.length, out_blocs, out_blengths) + + cpdef BlockIndex make_union(self, SparseIndex y): + ''' + Combine together two BlockIndex objects, accepting indices if contained + in one or the other + + Parameters + ---------- + other : SparseIndex + + Notes + ----- + union is a protected keyword in Cython, hence make_union + + Returns + ------- + union : BlockIndex + ''' + return BlockUnion(self, y.to_block_index()).result + + cpdef lookup(self, Py_ssize_t index): + ''' + + Returns -1 if not found + ''' + cdef: + Py_ssize_t i, cum_len + ndarray[int32_t, ndim=1] locs, lens + + locs = self.blocs + lens = self.blengths + + if self.nblocks == 0: + return -1 + elif index < locs[0]: + return -1 + + cum_len = 0 + for i from 0 <= i < self.nblocks: + if index >= locs[i] and index < locs[i] + lens[i]: + return cum_len + index - locs[i] + cum_len += lens[i] + + return -1 + + cpdef ndarray reindex(self, ndarray[float64_t, ndim=1] values, + float64_t fill_value, SparseIndex other_): + cdef: + Py_ssize_t i = 0, j = 0, ocur, ocurlen + BlockIndex other + ndarray[float64_t, ndim=1] result + ndarray[int32_t, ndim=1] slocs, slens, olocs, olens + + other = other_.to_block_index() + + olocs = other.blocs + olens = other.blengths + slocs = self.blocs + slens = self.blengths + + result = np.empty(other.npoints, dtype=np.float64) + + for 0 <= i < other.nblocks: + ocur = olocs[i] + ocurlen = olens[i] + + while slocs[j] + slens[j] < ocur: + j += 1 + + cpdef put(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices, object to_put): + pass + + cpdef take(self, ndarray[float64_t, ndim=1] values, + ndarray[int32_t, ndim=1] indices): + pass + + +cdef class BlockMerge(object): + ''' + Object-oriented approach makes sharing state between recursive functions a + lot easier and reduces code duplication + ''' + cdef: + BlockIndex x, y, result + ndarray xstart, xlen, xend, ystart, ylen, yend + int32_t xi, yi # block indices + + def __init__(self, BlockIndex x, BlockIndex y): + self.x = x + self.y = y + + if x.length != y.length: + raise Exception('Indices must reference same underlying length') + + self.xstart = self.x.blocs + self.ystart = self.y.blocs + + self.xend = self.x.blocs + self.x.blengths + self.yend = self.y.blocs + self.y.blengths + + # self.xlen = self.x.blengths + # self.ylen = self.y.blengths + + self.xi = 0 + self.yi = 0 + + self.result = self._make_merged_blocks() + + cdef _make_merged_blocks(self): + raise NotImplementedError + + cdef _set_current_indices(self, int32_t 
xi, int32_t yi, bint mode): + if mode == 0: + self.xi = xi + self.yi = yi + else: + self.xi = yi + self.yi = xi + +cdef class BlockIntersection(BlockMerge): + ''' + not done yet + ''' + pass + +cdef class BlockUnion(BlockMerge): + ''' + Object-oriented approach makes sharing state between recursive functions a + lot easier and reduces code duplication + ''' + + cdef _make_merged_blocks(self): + cdef: + ndarray[int32_t, ndim=1] xstart, xend, ystart, yend + int32_t nstart, nend, diff + list out_blocs = [], out_blengths = [] + + xstart = self.xstart + xend = self.xend + ystart = self.ystart + yend = self.yend + + while True: + # we are done (or possibly never began) + if self.xi >= self.x.nblocks and self.yi >= self.y.nblocks: + break + elif self.yi >= self.y.nblocks: + # through with y, just pass through x blocks + nstart = xstart[self.xi] + nend = xend[self.xi] + self.xi += 1 + elif self.xi >= self.x.nblocks: + # through with x, just pass through y blocks + nstart = ystart[self.yi] + nend = yend[self.yi] + self.yi += 1 + else: + # find end of new block + if xstart[self.xi] < ystart[self.yi]: + nstart = xstart[self.xi] + nend = self._find_next_block_end(0) + else: + nstart = ystart[self.yi] + nend = self._find_next_block_end(1) + + out_blocs.append(nstart) + out_blengths.append(nend - nstart) + + return BlockIndex(self.x.length, out_blocs, out_blengths) + + cdef int32_t _find_next_block_end(self, bint mode) except -1: + ''' + Wow, this got complicated in a hurry + + mode 0: block started in index x + mode 1: block started in index y + ''' + cdef: + ndarray[int32_t, ndim=1] xstart, xend, ystart, yend + int32_t xi, yi, xnblocks, ynblocks, nend + + if mode != 0 and mode != 1: + raise Exception('Mode must be 0 or 1') + + # so symmetric code will work + if mode == 0: + xstart = self.xstart + xend = self.xend + xi = self.xi + + ystart = self.ystart + yend = self.yend + yi = self.yi + ynblocks = self.y.nblocks + else: + xstart = self.ystart + xend = self.yend + xi = self.yi + + ystart = self.xstart + yend = self.xend + yi = self.xi + ynblocks = self.x.nblocks + + nend = xend[xi] + + # print 'here xi=%d, yi=%d, mode=%d, nend=%d' % (self.xi, self.yi, + # mode, nend) + + # done with y? 
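+        # if the other side is exhausted, or its next block starts after nend,
+        # the merged block ends at nend; otherwise the blocks overlap and the
+        # search continues with the roles of x and y swapped (1 - mode)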
+ if yi == ynblocks: + self._set_current_indices(xi + 1, yi, mode) + return nend + elif nend < ystart[yi]: + # block ends before y block + self._set_current_indices(xi + 1, yi, mode) + return nend + else: + while yi < ynblocks and nend > yend[yi]: + yi += 1 + + self._set_current_indices(xi + 1, yi, mode) + + if yi == ynblocks: + return nend + + if nend < ystart[yi]: + # we're done, return the block end + return nend + else: + # merge blocks, continue searching + # this also catches the case where blocks + return self._find_next_block_end(1 - mode) + + +#------------------------------------------------------------------------------- +# Sparse arithmetic + +ctypedef float64_t (* double_func)(float64_t a, float64_t b) + +cdef inline tuple sparse_nancombine(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex, + double_func op): + # faster to convert to IntIndex + return int_nanop(x, xindex.to_int_index(), + y, yindex.to_int_index(), op) + + # if isinstance(xindex, BlockIndex): + # return block_nanop(x, xindex.to_block_index(), + # y, yindex.to_block_index(), op) + # elif isinstance(xindex, IntIndex): + # return int_nanop(x, xindex.to_int_index(), + # y, yindex.to_int_index(), op) + + +cdef inline tuple sparse_combine(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill, + double_func op): + if isinstance(xindex, BlockIndex): + return block_op(x, xindex.to_block_index(), xfill, + y, yindex.to_block_index(), yfill, op) + elif isinstance(xindex, IntIndex): + return int_op(x, xindex.to_int_index(), xfill, + y, yindex.to_int_index(), yfill, op) + +# NaN-based arithmetic operation-- no handling of fill values +# TODO: faster to convert everything to dense? + +@cython.boundscheck(False) +cdef inline tuple block_nanop(ndarray x_, BlockIndex xindex, + ndarray y_, BlockIndex yindex, + double_func op): + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0, obp = 0 # block positions + Py_ssize_t xblock = 0, yblock = 0, outblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + out_index = xindex.intersect(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # walk the two SparseVectors, adding matched locations... + for out_i from 0 <= out_i < out_index.npoints: + + # I have a feeling this is inefficient + + # walk x + while xindex.locbuf[xblock] + xbp < out_index.locbuf[outblock] + obp: + xbp += 1 + xi += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + # walk y + while yindex.locbuf[yblock] + ybp < out_index.locbuf[outblock] + obp: + ybp += 1 + yi += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + out[out_i] = op(x[xi], y[yi]) + + # advance. 
strikes me as too complicated + xi += 1 + yi += 1 + + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + obp += 1 + if obp == out_index.lenbuf[outblock]: + outblock += 1 + obp = 0 + + return out, out_index + +@cython.boundscheck(False) +cdef inline tuple int_nanop(ndarray x_, IntIndex xindex, + ndarray y_, IntIndex yindex, + double_func op): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.intersect(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... + for out_i from 0 <= out_i < out_index.npoints: + + # walk x + while xindices[xi] < out_indices[out_i]: + xi += 1 + + # walk y + while yindices[yi] < out_indices[out_i]: + yi += 1 + + out[out_i] = op(x[xi], y[yi]) + + # advance + xi += 1 + yi += 1 + + return out, out_index + + +@cython.boundscheck(False) +cdef inline tuple block_op(ndarray x_, BlockIndex xindex, float64_t xfill, + ndarray y_, BlockIndex yindex, float64_t yfill, + double_func op): + ''' + Binary operator on BlockIndex objects with fill values + ''' + + cdef: + BlockIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + Py_ssize_t xbp = 0, ybp = 0 # block positions + int32_t xloc, yloc + Py_ssize_t xblock = 0, yblock = 0 # block numbers + + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # to suppress Cython warning + x = x_ + y = y_ + + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + # Wow, what a hack job. Need to do something about this + + # walk the two SparseVectors, adding matched locations... 
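+    # (xblock, xbp) and (yblock, ybp) track the current block number and the
+    # offset within that block for each input; when one side has no block
+    # covering the output position, its fill value is used instead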
+ for out_i from 0 <= out_i < out_index.npoints: + if yblock == yindex.nblocks: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + continue + + if xblock == xindex.nblocks: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + continue + + yloc = yindex.locbuf[yblock] + ybp + xloc = xindex.locbuf[xblock] + xbp + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = op(x[xi], y[yi]) + xi += 1 + yi += 1 + + # advance both locations + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + elif xloc < yloc: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + + # advance x location + xbp += 1 + if xbp == xindex.lenbuf[xblock]: + xblock += 1 + xbp = 0 + else: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + # advance y location + ybp += 1 + if ybp == yindex.lenbuf[yblock]: + yblock += 1 + ybp = 0 + + return out, out_index + + +@cython.boundscheck(False) +cdef inline tuple int_op(ndarray x_, IntIndex xindex, float64_t xfill, + ndarray y_, IntIndex yindex, float64_t yfill, + double_func op): + cdef: + IntIndex out_index + Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices + int32_t xloc, yloc + ndarray[int32_t, ndim=1] xindices, yindices, out_indices + ndarray[float64_t, ndim=1] x, y + ndarray[float64_t, ndim=1] out + + # suppress Cython compiler warnings due to inlining + x = x_ + y = y_ + + # need to do this first to know size of result array + out_index = xindex.make_union(yindex) + out = np.empty(out_index.npoints, dtype=np.float64) + + xindices = xindex.indices + yindices = yindex.indices + out_indices = out_index.indices + + # walk the two SparseVectors, adding matched locations... 
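+    # The loop below merges the two sorted integer index arrays: once either
+    # side runs out of points, its fill value is combined with the remaining
+    # values of the other side.  While both sides are active, equal locations
+    # combine both stored values; otherwise the side with the smaller
+    # location is emitted and the other side's fill value is substituted for
+    # its missing entry.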
+ for out_i from 0 <= out_i < out_index.npoints: + if xi == xindex.npoints: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + continue + + if yi == yindex.npoints: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + continue + + xloc = xindices[xi] + yloc = yindices[yi] + + # each index in the out_index had to come from either x, y, or both + if xloc == yloc: + out[out_i] = op(x[xi], y[yi]) + xi += 1 + yi += 1 + elif xloc < yloc: + # use y fill value + out[out_i] = op(x[xi], yfill) + xi += 1 + else: + # use x fill value + out[out_i] = op(xfill, y[yi]) + yi += 1 + + return out, out_index + +cdef inline float64_t __add(float64_t a, float64_t b): + return a + b + +cdef inline float64_t __sub(float64_t a, float64_t b): + return a - b + +cdef inline float64_t __rsub(float64_t a, float64_t b): + return b - a + +cdef inline float64_t __div(float64_t a, float64_t b): + if b == 0: + if a >= 0: + return INF + else: + return -INF + else: + return a / b + +cdef inline float64_t __rdiv(float64_t a, float64_t b): + return __div(b, a) + +cdef inline float64_t __floordiv(float64_t a, float64_t b): + if b == 0: + if a >= 0: + return INF + else: + return -INF + else: + return a // b + +cdef inline float64_t __rfloordiv(float64_t a, float64_t b): + return __floordiv(b, a) + +cdef inline float64_t __mul(float64_t a, float64_t b): + return a * b +cdef inline float64_t __eq(float64_t a, float64_t b): + return a == b +cdef inline float64_t __ne(float64_t a, float64_t b): + return a != b +cdef inline float64_t __lt(float64_t a, float64_t b): + return a < b +cdef inline float64_t __gt(float64_t a, float64_t b): + return a > b + +cdef inline float64_t __pow(float64_t a, float64_t b): + # NaN + if a != a or b != b: + return NaN + return a ** b + +cdef inline float64_t __rpow(float64_t a, float64_t b): + return __pow(b, a) + + +# This probably needs to be "templated" to achieve maximum performance. 
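+# Illustrative usage sketch (names are placeholders, not part of the API):
+# the NaN-based entry points below operate over the *intersection* of the two
+# sparse indices and return a (values, index) tuple, e.g.
+#
+#     out, out_index = sparse_nanadd(xvals, xindex, yvals, yindex)
+#
+# while the fill-value variants operate over the *union* and take explicit
+# fill values for positions present on only one side:
+#
+#     out, out_index = sparse_add(xvals, xindex, xfill, yvals, yindex, yfill)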
+# TODO: quantify performance boost to "templating" + +cpdef sparse_nanadd(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __add) + +cpdef sparse_nansub(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __sub) + +cpdef sparse_nanrsub(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rsub) + +cpdef sparse_nanmul(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __mul) + +cpdef sparse_nandiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __div) + +cpdef sparse_nanrdiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rdiv) + +sparse_nantruediv = sparse_nandiv +sparse_nanrtruediv = sparse_nanrdiv + +cpdef sparse_nanfloordiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __floordiv) + +cpdef sparse_nanrfloordiv(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rfloordiv) + +cpdef sparse_nanpow(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __pow) + +cpdef sparse_nanrpow(ndarray x, SparseIndex xindex, + ndarray y, SparseIndex yindex): + return sparse_nancombine(x, xindex, y, yindex, __rpow) + +cpdef sparse_add(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __add) + +cpdef sparse_sub(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __sub) + +cpdef sparse_rsub(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rsub) + +cpdef sparse_mul(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __mul) + +cpdef sparse_div(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __div) + +cpdef sparse_rdiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rdiv) + +sparse_truediv = sparse_div +sparse_rtruediv = sparse_rdiv + +cpdef sparse_floordiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __floordiv) + +cpdef sparse_rfloordiv(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __rfloordiv) + +cpdef sparse_pow(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, yfill, __pow) + +cpdef sparse_rpow(ndarray x, SparseIndex xindex, float64_t xfill, + ndarray y, SparseIndex yindex, float64_t yfill): + return sparse_combine(x, xindex, xfill, + y, yindex, 
yfill, __rpow) + + +#------------------------------------------------------------------------------- +# Indexing operations + +def get_reindexer(ndarray[object, ndim=1] values, dict index_map): + cdef object idx + cdef Py_ssize_t i + cdef Py_ssize_t new_length = len(values) + cdef ndarray[int32_t, ndim=1] indexer + + indexer = np.empty(new_length, dtype=np.int32) + + for i in range(new_length): + idx = values[i] + if idx in index_map: + indexer[i] = index_map[idx] + else: + indexer[i] = -1 + + return indexer + +# def reindex_block(ndarray[float64_t, ndim=1] values, +# BlockIndex sparse_index, +# ndarray[int32_t, ndim=1] indexer): +# cdef: +# Py_ssize_t i, length +# ndarray[float64_t, ndim=1] out + +# out = np.empty(length, dtype=np.float64) + +# for i from 0 <= i < length: +# if indexer[i] == -1: +# pass + + +# cdef class SparseCruncher(object): +# ''' +# Class to acquire float pointer for convenient operations on sparse data +# structures +# ''' +# cdef: +# SparseIndex index +# float64_t* buf + +# def __init__(self, ndarray[float64_t, ndim=1, mode='c'] values, +# SparseIndex index): + +# self.index = index +# self.buf = values.data + + +def reindex_integer(ndarray[float64_t, ndim=1] values, + IntIndex sparse_index, + ndarray[int32_t, ndim=1] indexer): + pass diff --git a/pandas/src/stats.pyx b/pandas/src/stats.pyx new file mode 100644 index 00000000..f4d87f41 --- /dev/null +++ b/pandas/src/stats.pyx @@ -0,0 +1,550 @@ +cdef float64_t FP_ERR = 1e-13 + +cimport util + +cdef: + int TIEBREAK_AVERAGE = 0 + int TIEBREAK_MIN = 1 + int TIEBREAK_MAX = 2 + int TIEBREAK_FIRST = 3 + int TIEBREAK_FIRST_DESCENDING = 4 + +tiebreakers = { + 'average' : TIEBREAK_AVERAGE, + 'min' : TIEBREAK_MIN, + 'max' : TIEBREAK_MAX, + 'first' : TIEBREAK_FIRST +} + + +# ctypedef fused pvalue_t: +# float64_t +# int64_t +# object + +# from cython cimport floating, integral + +cdef _take_2d_float64(ndarray[float64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[float64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_int64(ndarray[int64_t, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[int64_t, ndim=2] result + object val + + N, K = ( values).shape + result = np.empty_like(values) + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + +cdef _take_2d_object(ndarray[object, ndim=2] values, + object idx): + cdef: + Py_ssize_t i, j, N, K + ndarray[Py_ssize_t, ndim=2, cast=True] indexer = idx + ndarray[object, ndim=2] result + object val + + N, K = ( values).shape + result = values.copy() + for i in range(N): + for j in range(K): + result[i, j] = values[i, indexer[i, j]] + return result + + +def rank_1d_float64(object in_arr, ties_method='average', ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0 + ndarray[float64_t] sorted_data, ranks, values + ndarray[int64_t] argsorted + float64_t val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + values = np.asarray(in_arr).copy() + + if ascending: + nan_value = np.inf + else: + nan_value = -np.inf + mask = np.isnan(values) + np.putmask(values, mask, nan_value) + + n = len(values) + ranks = np.empty(n, 
dtype='f8') + + # py2.5/win32 hack, can't pass i8 + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + for i in range(n): + sum_ranks += i + 1 + dups += 1 + val = sorted_data[i] + if val == nan_value: + ranks[argsorted[i]] = nan + continue + if i == n - 1 or fabs(sorted_data[i + 1] - val) > FP_ERR: + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + sum_ranks = dups = 0 + return ranks + + +def rank_1d_int64(object in_arr, ties_method='average', ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0 + ndarray[int64_t] sorted_data, values + ndarray[float64_t] ranks + ndarray[int64_t] argsorted + int64_t val + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + values = np.asarray(in_arr) + + n = len(values) + ranks = np.empty(n, dtype='f8') + + # py2.5/win32 hack, can't pass i8 + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort() + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + for i in range(n): + sum_ranks += i + 1 + dups += 1 + val = sorted_data[i] + if i == n - 1 or fabs(sorted_data[i + 1] - val) > 0: + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = j + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = 2 * i - j - dups + 2 + sum_ranks = dups = 0 + return ranks + + +def rank_2d_float64(object in_arr, axis=0, ties_method='average', + ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0 + ndarray[float64_t, ndim=2] ranks, values + ndarray[int64_t, ndim=2] argsorted + float64_t val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = in_arr.copy() + + if ascending: + nan_value = np.inf + else: + nan_value = -np.inf + + np.putmask(values, np.isnan(values), nan_value) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = 
TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_float64(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = 0 + for j in range(k): + sum_ranks += j + 1 + dups += 1 + val = values[i, j] + if val == nan_value: + ranks[i, argsorted[i, j]] = nan + continue + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + sum_ranks = dups = 0 + + if axis == 0: + return ranks.T + else: + return ranks + + +def rank_2d_int64(object in_arr, axis=0, ties_method='average', + ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, dups = 0 + ndarray[float64_t, ndim=2] ranks + ndarray[int64_t, ndim=2] argsorted + ndarray[int64_t, ndim=2, cast=True] values + int64_t val + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + if axis == 0: + values = np.asarray(in_arr).T + else: + values = np.asarray(in_arr) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + if tiebreak == TIEBREAK_FIRST: + # need to use a stable sort here + _as = values.argsort(axis=1, kind='mergesort') + if not ascending: + tiebreak = TIEBREAK_FIRST_DESCENDING + else: + _as = values.argsort(1) + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_int64(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = 0 + for j in range(k): + sum_ranks += j + 1 + dups += 1 + val = values[i, j] + if j == k - 1 or fabs(values[i, j + 1] - val) > FP_ERR: + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = z + 1 + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = 2 * j - z - dups + 2 + sum_ranks = dups = 0 + + if axis == 0: + return ranks.T + else: + return ranks + + +def rank_1d_generic(object in_arr, bint retry=1, ties_method='average', + ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, n, dups = 0 + ndarray[float64_t] ranks + ndarray sorted_data, values + ndarray[int64_t] argsorted + object val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + values = np.array(in_arr, copy=True) + + if values.dtype != np.object_: + values = values.astype('O') + + if ascending: + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + + mask = isnullobj(values) + np.putmask(values, mask, nan_value) + + n = len(values) + ranks = 
np.empty(n, dtype='f8') + + # py2.5/win32 hack, can't pass i8 + try: + _as = values.argsort() + except TypeError: + if not retry: + raise + + valid_locs = (-mask).nonzero()[0] + ranks.put(valid_locs, rank_1d_generic(values.take(valid_locs), 0, + ties_method=ties_method, + ascending=ascending)) + np.putmask(ranks, mask, np.nan) + return ranks + + if not ascending: + _as = _as[::-1] + + sorted_data = values.take(_as) + argsorted = _as.astype('i8') + + for i in range(n): + sum_ranks += i + 1 + dups += 1 + val = util.get_value_at(sorted_data, i) + if val is nan_value: + ranks[argsorted[i]] = nan + continue + if (i == n - 1 or + are_diff(util.get_value_at(sorted_data, i + 1), val)): + if tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + ranks[argsorted[j]] = i + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for non-numeric data') + sum_ranks = dups = 0 + return ranks + +cdef inline are_diff(object left, object right): + try: + return fabs(left - right) > FP_ERR + except TypeError: + return left != right + +_return_false = lambda self, other: False +_return_true = lambda self, other: True + +class Infinity(object): + + __lt__ = _return_false + __le__ = _return_false + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_true + __ge__ = _return_true + __cmp__ = _return_false + +class NegInfinity(object): + + __lt__ = _return_true + __le__ = _return_true + __eq__ = _return_false + __ne__ = _return_true + __gt__ = _return_false + __ge__ = _return_false + __cmp__ = _return_true + +def rank_2d_generic(object in_arr, axis=0, ties_method='average', + ascending=True): + """ + Fast NaN-friendly version of scipy.stats.rankdata + """ + + cdef: + Py_ssize_t i, j, z, k, n, infs, dups = 0 + ndarray[float64_t, ndim=2] ranks + ndarray[object, ndim=2] values + ndarray[int64_t, ndim=2] argsorted + object val, nan_value + float64_t sum_ranks = 0 + int tiebreak = 0 + tiebreak = tiebreakers[ties_method] + + in_arr = np.asarray(in_arr) + + if axis == 0: + values = in_arr.T.copy() + else: + values = in_arr.copy() + + if values.dtype != np.object_: + values = values.astype('O') + + if ascending: + # always greater than everything + nan_value = Infinity() + else: + nan_value = NegInfinity() + + mask = isnullobj2d(values) + np.putmask(values, mask, nan_value) + + n, k = ( values).shape + ranks = np.empty((n, k), dtype='f8') + + try: + _as = values.argsort(1) + except TypeError: + values = in_arr + for i in range(len(values)): + ranks[i] = rank_1d_generic(in_arr[i], + ties_method=ties_method, + ascending=ascending) + if axis == 0: + return ranks.T + else: + return ranks + + if not ascending: + _as = _as[:, ::-1] + + values = _take_2d_object(values, _as) + argsorted = _as.astype('i8') + + for i in range(n): + dups = sum_ranks = infs = 0 + for j in range(k): + val = values[i, j] + if val is nan_value: + ranks[i, argsorted[i, j]] = nan + infs += 1 + continue + sum_ranks += (j - infs) + 1 + dups += 1 + if j == k - 1 or are_diff(values[i, j + 1], val): + if tiebreak == TIEBREAK_AVERAGE: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for z in range(j - dups + 1, j + 1): + ranks[i, argsorted[i, z]] = j - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for z in range(j - dups + 
1, j + 1): + ranks[i, argsorted[i, z]] = j + 1 + elif tiebreak == TIEBREAK_FIRST: + raise ValueError('first not supported for ' + 'non-numeric data') + sum_ranks = dups = 0 + + if axis == 0: + return ranks.T + else: + return ranks + +# def _take_indexer_2d(ndarray[float64_t, ndim=2] values, +# ndarray[Py_ssize_t, ndim=2, cast=True] indexer): +# cdef: +# Py_ssize_t i, j, N, K +# ndarray[float64_t, ndim=2] result + +# N, K = ( values).shape +# result = np.empty_like(values) +# for i in range(N): +# for j in range(K): +# result[i, j] = values[i, indexer[i, j]] +# return result diff --git a/pandas/src/stdint.h b/pandas/src/stdint.h new file mode 100644 index 00000000..b0fd235a --- /dev/null +++ b/pandas/src/stdint.h @@ -0,0 +1,10 @@ +#ifndef _PANDAS_STDINT_H_ +#define _PANDAS_STDINT_H_ + +#if defined(_MSC_VER) +#include "ms_stdint.h" +#else +#include +#endif + +#endif diff --git a/pandas/src/tseries.pyx b/pandas/src/tseries.pyx new file mode 100644 index 00000000..a66b2193 --- /dev/null +++ b/pandas/src/tseries.pyx @@ -0,0 +1,709 @@ +cimport numpy as np +cimport cython +import numpy as np + +from numpy cimport * +from numpy cimport NPY_INT32 as NPY_int32 +from numpy cimport NPY_INT64 as NPY_int64 +from numpy cimport NPY_FLOAT32 as NPY_float32 +from numpy cimport NPY_FLOAT64 as NPY_float64 + +int32 = np.dtype(np.int32) +int64 = np.dtype(np.int64) +float32 = np.dtype(np.float32) +float64 = np.dtype(np.float64) + +cdef np.int32_t MINint32 = np.iinfo(np.int32).min +cdef np.int64_t MINint64 = np.iinfo(np.int64).min +cdef np.float32_t MINfloat32 = np.NINF +cdef np.float64_t MINfloat64 = np.NINF + +cdef np.int32_t MAXint32 = np.iinfo(np.int32).max +cdef np.int64_t MAXint64 = np.iinfo(np.int64).max +cdef np.float32_t MAXfloat32 = np.inf +cdef np.float64_t MAXfloat64 = np.inf + + +cdef extern from "numpy/arrayobject.h": + cdef enum NPY_TYPES: + NPY_intp "NPY_INTP" + +from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem, + PyDict_Contains, PyDict_Keys, + Py_INCREF, PyTuple_SET_ITEM, + PyTuple_SetItem, + PyTuple_New) +from cpython cimport PyFloat_Check +cimport cpython + +isnan = np.isnan +cdef double NaN = np.NaN +cdef double nan = NaN +cdef double NAN = nan + +from datetime import datetime as pydatetime + +# this is our datetime.pxd +from datetime cimport * + +cdef int64_t NPY_NAT = util.get_nat() + +from khash cimport * + +cdef inline int int_max(int a, int b): return a if a >= b else b +cdef inline int int_min(int a, int b): return a if a <= b else b + +ctypedef unsigned char UChar + +cimport util +from util cimport is_array, _checknull, _checknan + +cdef extern from "math.h": + double sqrt(double x) + double fabs(double) + +# import datetime C API +PyDateTime_IMPORT + +# initialize numpy +import_array() +import_ufunc() + +cpdef map_indices_list(list index): + ''' + Produce a dict mapping the values of the input array to their respective + locations. + + Example: + array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1} + + Better to do this with Cython because of the enormous speed boost. 
+ ''' + cdef Py_ssize_t i, length + cdef dict result = {} + + length = len(index) + + for i from 0 <= i < length: + result[index[i]] = i + + return result + + +from libc.stdlib cimport malloc, free + +def ismember(ndarray arr, set values): + ''' + Checks whether + + Parameters + ---------- + arr : ndarray + values : set + + Returns + ------- + ismember : ndarray (boolean dtype) + ''' + cdef: + Py_ssize_t i, n + flatiter it + ndarray[uint8_t] result + object val + + it = PyArray_IterNew(arr) + n = len(arr) + result = np.empty(n, dtype=np.uint8) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + if val in values: + result[i] = 1 + else: + result[i] = 0 + PyArray_ITER_NEXT(it) + + return result.view(np.bool_) + +#---------------------------------------------------------------------- +# datetime / io related + +cdef int _EPOCH_ORD = 719163 + +from datetime import date as pydate + +cdef inline int64_t gmtime(object date): + cdef int y, m, d, h, mn, s, days + + y = PyDateTime_GET_YEAR(date) + m = PyDateTime_GET_MONTH(date) + d = PyDateTime_GET_DAY(date) + h = PyDateTime_DATE_GET_HOUR(date) + mn = PyDateTime_DATE_GET_MINUTE(date) + s = PyDateTime_DATE_GET_SECOND(date) + + days = pydate(y, m, 1).toordinal() - _EPOCH_ORD + d - 1 + return (( (((days * 24 + h) * 60 + mn))) * 60 + s) * 1000 + +cpdef object to_datetime(int64_t timestamp): + return pydatetime.utcfromtimestamp(timestamp / 1000.0) + +cpdef object to_timestamp(object dt): + return gmtime(dt) + +def array_to_timestamp(ndarray[object, ndim=1] arr): + cdef int i, n + cdef ndarray[int64_t, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=np.int64) + + for i from 0 <= i < n: + result[i] = gmtime(arr[i]) + + return result + +def time64_to_datetime(ndarray[int64_t, ndim=1] arr): + cdef int i, n + cdef ndarray[object, ndim=1] result + + n = len(arr) + result = np.empty(n, dtype=object) + + for i from 0 <= i < n: + result[i] = to_datetime(arr[i]) + + return result + +#---------------------------------------------------------------------- +# isnull / notnull related + +cdef double INF = np.inf +cdef double NEGINF = -INF + +cpdef checknull(object val): + if util.is_float_object(val) or util.is_complex_object(val): + return val != val or val == INF or val == NEGINF + elif util.is_datetime64_object(val): + return get_datetime64_value(val) == NPY_NAT + elif isinstance(val, _NaT): + return True + elif is_array(val): + return False + else: + return util._checknull(val) + +def isscalar(object val): + return np.isscalar(val) or val is None or isinstance(val, _Timestamp) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj(ndarray[object] arr): + cdef Py_ssize_t i, n + cdef object val + cdef ndarray[uint8_t] result + + n = len(arr) + result = np.zeros(n, dtype=np.uint8) + for i from 0 <= i < n: + result[i] = util._checknull(arr[i]) + return result.view(np.bool_) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def isnullobj2d(ndarray[object, ndim=2] arr): + cdef Py_ssize_t i, j, n, m + cdef object val + cdef ndarray[uint8_t, ndim=2] result + + n, m = ( arr).shape + result = np.zeros((n, m), dtype=np.uint8) + for i from 0 <= i < n: + for j from 0 <= j < m: + val = arr[i, j] + if checknull(val): + result[i, j] = 1 + return result.view(np.bool_) + +def list_to_object_array(list obj): + ''' + Convert list to object ndarray. 
Seriously can't believe I had to write this + function + ''' + cdef: + Py_ssize_t i, n + ndarray[object] arr + + n = len(obj) + arr = np.empty(n, dtype=object) + + for i from 0 <= i < n: + arr[i] = obj[i] + + return arr + + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique(ndarray[object] values): + cdef: + Py_ssize_t i, n = len(values) + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < n: + val = values[i] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple(list arrays): + cdef: + ndarray[object] buf + Py_ssize_t k = len(arrays) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = arrays[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list(list lists): + cdef: + list buf + Py_ssize_t k = len(lists) + Py_ssize_t i, j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for i from 0 <= i < k: + buf = lists[i] + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_unique_multiple_list_gen(object gen): + cdef: + list buf + Py_ssize_t j, n + list uniques = [] + dict table = {} + object val, stub = 0 + + for buf in gen: + n = len(buf) + for j from 0 <= j < n: + val = buf[j] + if val not in table: + table[val] = stub + uniques.append(val) + + try: + uniques.sort() + except Exception: + pass + + return uniques + +@cython.wraparound(False) +@cython.boundscheck(False) +def dicts_to_array(list dicts, list columns): + cdef: + Py_ssize_t i, j, k, n + ndarray[object, ndim=2] result + dict row + object col, onan = np.nan + + k = len(columns) + n = len(dicts) + + result = np.empty((n, k), dtype='O') + + for i in range(n): + row = dicts[i] + for j in range(k): + col = columns[j] + if col in row: + result[i, j] = row[col] + else: + result[i, j] = onan + + return result + + +def fast_zip(list ndarrays): + ''' + For zipping multiple ndarrays into an ndarray of tuples + ''' + cdef: + Py_ssize_t i, j, k, n + ndarray[object] result + flatiter it + object val, tup + + k = len(ndarrays) + n = len(ndarrays[0]) + + result = np.empty(n, dtype=object) + + # initialize tuples on first pass + arr = ndarrays[0] + it = PyArray_IterNew(arr) + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + tup = PyTuple_New(k) + + PyTuple_SET_ITEM(tup, 0, val) + Py_INCREF(val) + result[i] = tup + PyArray_ITER_NEXT(it) + + for j in range(1, k): + arr = ndarrays[j] + it = PyArray_IterNew(arr) + if len(arr) != n: + raise ValueError('all arrays must be same length') + + for i in range(n): + val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it)) + PyTuple_SET_ITEM(result[i], j, val) + Py_INCREF(val) + PyArray_ITER_NEXT(it) + + return result + +def get_reverse_indexer(ndarray[int64_t] indexer, Py_ssize_t length): + cdef: + Py_ssize_t i, n = len(indexer) + ndarray[int64_t] rev_indexer + int64_t idx + + rev_indexer = np.empty(length, dtype=np.int64) + rev_indexer.fill(-1) + for i in range(n): + idx = 
indexer[i] + if idx != -1: + rev_indexer[idx] = i + + return rev_indexer + + +def has_infs_f4(ndarray[float32_t] arr): + cdef: + Py_ssize_t i, n = len(arr) + float32_t inf, neginf, val + + inf = np.inf + neginf = -inf + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False + +def has_infs_f8(ndarray[float64_t] arr): + cdef: + Py_ssize_t i, n = len(arr) + float64_t inf, neginf, val + + inf = np.inf + neginf = -inf + + for i in range(n): + val = arr[i] + if val == inf or val == neginf: + return True + return False + +def convert_timestamps(ndarray values): + cdef: + object val, f, result + dict cache = {} + Py_ssize_t i, n = len(values) + ndarray[object] out + + # for HDFStore, a bit temporary but... + + from datetime import datetime + f = datetime.fromtimestamp + + out = np.empty(n, dtype='O') + + for i in range(n): + val = util.get_value_1d(values, i) + if val in cache: + out[i] = cache[val] + else: + cache[val] = out[i] = f(val) + + return out + +def maybe_indices_to_slice(ndarray[int64_t] indices): + cdef: + Py_ssize_t i, n = len(indices) + + if n == 0: + return indices + + for i in range(1, n): + if indices[i] - indices[i - 1] != 1: + return indices + return slice(indices[0], indices[n - 1] + 1) + + +def maybe_booleans_to_slice(ndarray[uint8_t] mask): + cdef: + Py_ssize_t i, n = len(mask) + Py_ssize_t start, end + bint started = 0, finished = 0 + + for i in range(n): + if mask[i]: + if finished: + return mask.view(np.bool_) + if not started: + started = 1 + start = i + else: + if finished: + continue + + if started: + end = i + finished = 1 + + if not started: + return slice(0, 0) + if not finished: + return slice(start, None) + else: + return slice(start, end) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_compare(ndarray[object] values, object val, object op): + import operator + cdef: + Py_ssize_t i, n = len(values) + ndarray[uint8_t, cast=True] result + int flag + object x + + if op is operator.lt: + flag = cpython.Py_LT + elif op is operator.le: + flag = cpython.Py_LE + elif op is operator.gt: + flag = cpython.Py_GT + elif op is operator.ge: + flag = cpython.Py_GE + elif op is operator.eq: + flag = cpython.Py_EQ + elif op is operator.ne: + flag = cpython.Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + + if flag == cpython.Py_NE: + for i in range(n): + x = values[i] + if _checknull(x): + result[i] = True + else: + result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + else: + for i in range(n): + x = values[i] + if _checknull(x): + result[i] = False + else: + result[i] = cpython.PyObject_RichCompareBool(x, val, flag) + + return result.view(bool) + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_compare(ndarray[object] left, ndarray[object] right, object op): + import operator + cdef: + Py_ssize_t i, n = len(left) + ndarray[uint8_t, cast=True] result + int flag + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + if op is operator.lt: + flag = cpython.Py_LT + elif op is operator.le: + flag = cpython.Py_LE + elif op is operator.gt: + flag = cpython.Py_GT + elif op is operator.ge: + flag = cpython.Py_GE + elif op is operator.eq: + flag = cpython.Py_EQ + elif op is operator.ne: + flag = cpython.Py_NE + else: + raise ValueError('Unrecognized operator') + + result = np.empty(n, dtype=bool).view(np.uint8) + + if flag == cpython.Py_NE: + for i in range(n): + x = left[i] + y = 
right[i] + + if _checknull(x) or _checknull(y): + result[i] = True + else: + result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + else: + for i in range(n): + x = left[i] + y = right[i] + + if _checknull(x) or _checknull(y): + result[i] = False + else: + result[i] = cpython.PyObject_RichCompareBool(x, y, flag) + + return result.view(bool) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def scalar_binop(ndarray[object] values, object val, object op): + cdef: + Py_ssize_t i, n = len(values) + ndarray[object] result + object x + + result = np.empty(n, dtype=object) + + for i in range(n): + x = values[i] + if util._checknull(x): + result[i] = x + else: + result[i] = op(x, val) + + return maybe_convert_bool(result) + +@cython.wraparound(False) +@cython.boundscheck(False) +def vec_binop(ndarray[object] left, ndarray[object] right, object op): + cdef: + Py_ssize_t i, n = len(left) + ndarray[object] result + + if n != len(right): + raise ValueError('Arrays were different lengths: %d vs %d' + % (n, len(right))) + + result = np.empty(n, dtype=object) + + for i in range(n): + x = left[i] + y = right[i] + try: + result[i] = op(x, y) + except TypeError: + if util._checknull(x): + result[i] = x + elif util._checknull(y): + result[i] = y + else: + raise + + return maybe_convert_bool(result) + + +def value_count_int64(ndarray[int64_t] values): + cdef: + Py_ssize_t i, n = len(values) + kh_int64_t *table + int ret = 0 + list uniques = [] + + table = kh_init_int64() + kh_resize_int64(table, n) + + for i in range(n): + val = values[i] + k = kh_get_int64(table, val) + if k != table.n_buckets: + table.vals[k] += 1 + else: + k = kh_put_int64(table, val, &ret) + table.vals[k] = 1 + + # for (k = kh_begin(h); k != kh_end(h); ++k) + # if (kh_exist(h, k)) kh_value(h, k) = 1; + i = 0 + result_keys = np.empty(table.n_occupied, dtype=np.int64) + result_counts = np.zeros(table.n_occupied, dtype=np.int64) + for k in range(table.n_buckets): + if kh_exist_int64(table, k): + result_keys[i] = table.keys[k] + result_counts[i] = table.vals[k] + i += 1 + kh_destroy_int64(table) + + return result_keys, result_counts + +include "hashtable.pyx" +include "datetime.pyx" +include "skiplist.pyx" +include "groupby.pyx" +include "moments.pyx" +include "reindex.pyx" +include "reduce.pyx" +include "stats.pyx" +include "properties.pyx" +include "inference.pyx" +include "join.pyx" +include "engines.pyx" diff --git a/pandas/src/util.pxd b/pandas/src/util.pxd new file mode 100644 index 00000000..694fd920 --- /dev/null +++ b/pandas/src/util.pxd @@ -0,0 +1,64 @@ +from numpy cimport ndarray +cimport numpy as cnp + +cdef extern from "numpy_helper.h": + inline int is_integer_object(object) + inline int is_float_object(object) + inline int is_complex_object(object) + inline int is_bool_object(object) + inline int is_string_object(object) + inline int is_datetime64_object(object) + inline int assign_value_1d(ndarray, Py_ssize_t, object) except -1 + inline cnp.int64_t get_nat() + inline object get_value_1d(ndarray, Py_ssize_t) + inline char *get_c_string(object) + inline object floatify(object) + inline object char_to_string(char*) + +cdef inline object get_value_at(ndarray arr, object loc): + cdef: + Py_ssize_t i, sz + void* data_ptr + if is_float_object(loc): + casted = int(loc) + if casted == loc: + loc = casted + i = loc + sz = cnp.PyArray_SIZE(arr) + + if i < 0 and sz > 0: + i += sz + elif i >= sz or sz == 0: + raise IndexError('index out of bounds') + + return get_value_1d(arr, i) + +cdef inline set_value_at(ndarray arr, object 
loc, object value): + cdef: + Py_ssize_t i, sz + if is_float_object(loc): + casted = int(loc) + if casted == loc: + loc = casted + i = loc + sz = cnp.PyArray_SIZE(arr) + + if i < 0: + i += sz + elif i >= sz: + raise IndexError('index out of bounds') + + assign_value_1d(arr, i, value) + +cdef inline int is_contiguous(ndarray arr): + return cnp.PyArray_CHKFLAGS(arr, cnp.NPY_C_CONTIGUOUS) + +cdef inline is_array(object o): + return cnp.PyArray_Check(o) + + +cdef inline bint _checknull(object val): + return not cnp.PyArray_Check(val) and (val is None or val != val) + +cdef inline bint _checknan(object val): + return not cnp.PyArray_Check(val) and val != val diff --git a/pandas/stats/__init__.py b/pandas/stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/stats/api.py b/pandas/stats/api.py new file mode 100644 index 00000000..3732f9ed --- /dev/null +++ b/pandas/stats/api.py @@ -0,0 +1,9 @@ +""" +Common namespace of statistical functions +""" + +# pylint: disable-msg=W0611,W0614,W0401 + +from pandas.stats.moments import * +from pandas.stats.interface import ols +from pandas.stats.fama_macbeth import fama_macbeth diff --git a/pandas/stats/common.py b/pandas/stats/common.py new file mode 100644 index 00000000..492a7a76 --- /dev/null +++ b/pandas/stats/common.py @@ -0,0 +1,49 @@ +def _get_cluster_type(cluster_type): + cluster_type = _WINDOW_TYPES.get(cluster_type, cluster_type) + if cluster_type is None: + return cluster_type + + cluster_type_up = cluster_type.upper() + + if cluster_type_up == 'ENTITY': + return 'entity' + elif cluster_type_up == 'TIME': + return 'time' + else: # pragma: no cover + raise Exception('Unrecognized cluster type: %s' % cluster_type) + +_CLUSTER_TYPES = { + 0 : 'time', + 1 : 'entity' +} + +_WINDOW_TYPES = { + 0 : 'full_sample', + 1 : 'rolling', + 2 : 'expanding' +} + + +def _get_window_type(window_type): + window_type = _WINDOW_TYPES.get(window_type, window_type) + window_type_up = window_type.upper() + + if window_type_up in ('FULL SAMPLE', 'FULL_SAMPLE'): + return 'full_sample' + elif window_type_up == 'ROLLING': + return 'rolling' + elif window_type_up == 'EXPANDING': + return 'expanding' + else: # pragma: no cover + raise Exception('Unrecognized window type: %s' % window_type) + +def banner(text, width=80): + """ + + """ + toFill = width - len(text) + + left = toFill // 2 + right = toFill - left + + return '%s%s%s' % ('-' * left, text, '-' * right) diff --git a/pandas/stats/fama_macbeth.py b/pandas/stats/fama_macbeth.py new file mode 100644 index 00000000..586642f8 --- /dev/null +++ b/pandas/stats/fama_macbeth.py @@ -0,0 +1,221 @@ +from pandas.util.py3compat import StringIO + +import numpy as np + +from pandas.core.api import Series, DataFrame +import pandas.stats.common as common +from pandas.util.decorators import cache_readonly + +def fama_macbeth(**kwargs): + """Runs Fama-MacBeth regression. 
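+
+    Cross-sectional regressions are estimated period by period (internally a
+    rolling panel OLS with a window of one observation) and the mean,
+    standard error and t-statistic of the resulting series of coefficient
+    estimates are reported.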
+ + Parameters + ---------- + Takes the same arguments as a panel OLS, in addition to: + + nw_lags_beta: int + Newey-West adjusts the betas by the given lags + """ + window_type = kwargs.get('window_type') + if window_type is None: + klass = FamaMacBeth + else: + klass = MovingFamaMacBeth + + return klass(**kwargs) + +class FamaMacBeth(object): + def __init__(self, y, x, intercept=True, nw_lags=None, + nw_lags_beta=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies={}, verbose=False): + self._nw_lags_beta = nw_lags_beta + + from pandas.stats.plm import MovingPanelOLS + self._ols_result = MovingPanelOLS( + y=y, x=x, window_type='rolling', window=1, + intercept=intercept, + nw_lags=nw_lags, entity_effects=entity_effects, + time_effects=time_effects, x_effects=x_effects, cluster=cluster, + dropped_dummies=dropped_dummies, verbose=verbose) + + self._cols = self._ols_result._x.columns + + @cache_readonly + def _beta_raw(self): + return self._ols_result._beta_raw + + @cache_readonly + def _stats(self): + return _calc_t_stat(self._beta_raw, self._nw_lags_beta) + + @cache_readonly + def _mean_beta_raw(self): + return self._stats[0] + + @cache_readonly + def _std_beta_raw(self): + return self._stats[1] + + @cache_readonly + def _t_stat_raw(self): + return self._stats[2] + + def _make_result(self, result): + return Series(result, index=self._cols) + + @cache_readonly + def mean_beta(self): + return self._make_result(self._mean_beta_raw) + + @cache_readonly + def std_beta(self): + return self._make_result(self._std_beta_raw) + + @cache_readonly + def t_stat(self): + return self._make_result(self._t_stat_raw) + + @cache_readonly + def _results(self): + return { + 'mean_beta' : self._mean_beta_raw, + 'std_beta' : self._std_beta_raw, + 't_stat' : self._t_stat_raw, + } + + @cache_readonly + def _coef_table(self): + buffer = StringIO() + buffer.write('%13s %13s %13s %13s %13s %13s\n' % + ('Variable','Beta', 'Std Err','t-stat','CI 2.5%','CI 97.5%')) + template = '%13s %13.4f %13.4f %13.2f %13.4f %13.4f\n' + + for i, name in enumerate(self._cols): + if i and not (i % 5): + buffer.write('\n' + common.banner('')) + + mean_beta = self._results['mean_beta'][i] + std_beta = self._results['std_beta'][i] + t_stat = self._results['t_stat'][i] + ci1 = mean_beta - 1.96 * std_beta + ci2 = mean_beta + 1.96 * std_beta + + values = '(%s)' % name, mean_beta, std_beta, t_stat, ci1, ci2 + + buffer.write(template % values) + + if self._nw_lags_beta is not None: + buffer.write('\n') + buffer.write('*** The Std Err, t-stat are Newey-West ' + 'adjusted with Lags %5d\n' % self._nw_lags_beta) + + return buffer.getvalue() + + def __repr__(self): + return self.summary + + @cache_readonly + def summary(self): + template = """ +----------------------Summary of Fama-MacBeth Analysis------------------------- + +Formula: Y ~ %(formulaRHS)s +# betas : %(nu)3d + +----------------------Summary of Estimated Coefficients------------------------ +%(coefTable)s +--------------------------------End of Summary--------------------------------- +""" + params = { + 'formulaRHS' : ' + '.join(self._cols), + 'nu' : len(self._beta_raw), + 'coefTable' : self._coef_table, + } + + return template % params + +class MovingFamaMacBeth(FamaMacBeth): + def __init__(self, y, x, window_type='rolling', window=10, + intercept=True, nw_lags=None, nw_lags_beta=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies={}, verbose=False): + self._window_type = 
common._get_window_type(window_type) + self._window = window + + FamaMacBeth.__init__( + self, y=y, x=x, intercept=intercept, + nw_lags=nw_lags, nw_lags_beta=nw_lags_beta, + entity_effects=entity_effects, time_effects=time_effects, + x_effects=x_effects, cluster=cluster, + dropped_dummies=dropped_dummies, verbose=verbose) + + self._index = self._ols_result._index + self._T = len(self._index) + + @property + def _is_rolling(self): + return self._window_type == 'rolling' + + def _calc_stats(self): + mean_betas = [] + std_betas = [] + t_stats = [] + + # XXX + + mask = self._ols_result._rolling_ols_call[2] + obs_total = mask.astype(int).cumsum() + + start = self._window - 1 + betas = self._beta_raw + for i in xrange(start, self._T): + if self._is_rolling: + begin = i - start + else: + begin = 0 + + B = betas[max(obs_total[begin] - 1, 0) : obs_total[i]] + mean_beta, std_beta, t_stat = _calc_t_stat(B, self._nw_lags_beta) + mean_betas.append(mean_beta) + std_betas.append(std_beta) + t_stats.append(t_stat) + + return np.array([mean_betas, std_betas, t_stats]) + + _stats = cache_readonly(_calc_stats) + + def _make_result(self, result): + return DataFrame(result, index=self._result_index, columns=self._cols) + + @cache_readonly + def _result_index(self): + mask = self._ols_result._rolling_ols_call[2] + # HACK XXX + return self._index[mask.cumsum() >= self._window] + + @cache_readonly + def _results(self): + return { + 'mean_beta' : self._mean_beta_raw[-1], + 'std_beta' : self._std_beta_raw[-1], + 't_stat' : self._t_stat_raw[-1], + } + +def _calc_t_stat(beta, nw_lags_beta): + N = len(beta) + B = beta - beta.mean(0) + C = np.dot(B.T, B) / N + + if nw_lags_beta is not None: + for i in xrange(nw_lags_beta + 1): + + cov = np.dot(B[i:].T, B[:(N - i)]) / N + weight = i / (nw_lags_beta + 1) + C += 2 * (1 - weight) * cov + + mean_beta = beta.mean(0) + std_beta = np.sqrt(np.diag(C)) / np.sqrt(N) + t_stat = mean_beta / std_beta + + return mean_beta, std_beta, t_stat diff --git a/pandas/stats/interface.py b/pandas/stats/interface.py new file mode 100644 index 00000000..603d3b82 --- /dev/null +++ b/pandas/stats/interface.py @@ -0,0 +1,134 @@ +from pandas.core.api import Series, DataFrame, Panel, MultiIndex +from pandas.stats.ols import OLS, MovingOLS +from pandas.stats.plm import PanelOLS, MovingPanelOLS, NonPooledPanelOLS +import pandas.stats.common as common + +def ols(**kwargs): + """Returns the appropriate OLS object depending on whether you need + simple or panel OLS, and a full-sample or rolling/expanding OLS. + + Will be a normal linear regression or a (pooled) panel regression depending + on the type of the inputs: + + y : Series, x : DataFrame -> OLS + y : Series, x : dict of DataFrame -> OLS + y : DataFrame, x : DataFrame -> PanelOLS + y : DataFrame, x : dict of DataFrame/Panel -> PanelOLS + y : Series with MultiIndex, x : Panel/DataFrame + MultiIndex -> PanelOLS + + Parameters + ---------- + y: Series or DataFrame + See above for types + x: Series, DataFrame, dict of Series, dict of DataFrame, Panel + weights : Series or ndarray + The weights are presumed to be (proportional to) the inverse of the + variance of the observations. That is, if the variables are to be + transformed by 1/sqrt(W) you must supply weights = 1/W + intercept: bool + True if you want an intercept. Defaults to True. + nw_lags: None or int + Number of Newey-West lags. Defaults to None. + nw_overlap: bool + Whether there are overlaps in the NW lags. Defaults to False. 
+ window_type: {'full sample', 'rolling', 'expanding'} + 'full sample' by default + window: int + size of window (for rolling/expanding OLS). If window passed and no + explicit window_type, 'rolling" will be used as the window_type + + Panel OLS options: + pool: bool + Whether to run pooled panel regression. Defaults to true. + entity_effects: bool + Whether to account for entity fixed effects. Defaults to false. + time_effects: bool + Whether to account for time fixed effects. Defaults to false. + x_effects: list + List of x's to account for fixed effects. Defaults to none. + dropped_dummies: dict + Key is the name of the variable for the fixed effect. + Value is the value of that variable for which we drop the dummy. + + For entity fixed effects, key equals 'entity'. + + By default, the first dummy is dropped if no dummy is specified. + cluster: {'time', 'entity'} + cluster variances + + Examples + -------- + # Run simple OLS. + result = ols(y=y, x=x) + + # Run rolling simple OLS with window of size 10. + result = ols(y=y, x=x, window_type='rolling', window=10) + print result.beta + + result = ols(y=y, x=x, nw_lags=1) + + # Set up LHS and RHS for data across all items + y = A + x = {'B' : B, 'C' : C} + + # Run panel OLS. + result = ols(y=y, x=x) + + # Run expanding panel OLS with window 10 and entity clustering. + result = ols(y=y, x=x, cluster='entity', window_type='expanding', window=10) + + Returns + ------- + The appropriate OLS object, which allows you to obtain betas and various + statistics, such as std err, t-stat, etc. + """ + pool = kwargs.get('pool') + if 'pool' in kwargs: + del kwargs['pool'] + + window_type = kwargs.get('window_type') + window = kwargs.get('window') + + if window_type is None: + if window is None: + window_type = 'full_sample' + else: + window_type = 'rolling' + else: + window_type = common._get_window_type(window_type) + + if window_type != 'full_sample': + kwargs['window_type'] = common._get_window_type(window_type) + + y = kwargs.get('y') + x = kwargs.get('x') + + panel = False + if isinstance(y, DataFrame) or (isinstance(y, Series) and + isinstance(y.index, MultiIndex)): + panel = True + if isinstance(x, Panel): + panel = True + + if window_type == 'full_sample': + for rolling_field in ('window_type', 'window', 'min_periods'): + if rolling_field in kwargs: + del kwargs[rolling_field] + + if panel: + if pool == False: + klass = NonPooledPanelOLS + else: + klass = PanelOLS + else: + klass = OLS + else: + if panel: + if pool == False: + klass = NonPooledPanelOLS + else: + klass = MovingPanelOLS + else: + klass = MovingOLS + + return klass(**kwargs) diff --git a/pandas/stats/math.py b/pandas/stats/math.py new file mode 100644 index 00000000..c0484354 --- /dev/null +++ b/pandas/stats/math.py @@ -0,0 +1,123 @@ +# pylint: disable-msg=E1103 +# pylint: disable-msg=W0212 + +from __future__ import division + +import numpy as np +import numpy.linalg as linalg + +def rank(X, cond=1.0e-12): + """ + Return the rank of a matrix X based on its generalized inverse, + not the SVD. 
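+
+    In this implementation the 2-d case counts the singular values of X that
+    exceed `cond` times the largest singular value; a 1-d or scalar input
+    yields 0 if it is identically zero and 1 otherwise.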
+ """ + X = np.asarray(X) + if len(X.shape) == 2: + import scipy.linalg as SL + D = SL.svdvals(X) + result = np.add.reduce(np.greater(D / D.max(), cond)) + return int(result.astype(np.int32)) + else: + return int(not np.alltrue(np.equal(X, 0.))) + +def solve(a, b): + """Returns the solution of A X = B.""" + try: + return linalg.solve(a, b) + except linalg.LinAlgError: + return np.dot(linalg.pinv(a), b) + +def inv(a): + """Returns the inverse of A.""" + try: + return np.linalg.inv(a) + except linalg.LinAlgError: + return np.linalg.pinv(a) + +def is_psd(m): + eigvals = linalg.eigvals(m) + return np.isreal(eigvals).all() and (eigvals >= 0).all() + +def newey_west(m, max_lags, nobs, df, nw_overlap=False): + """ + Compute Newey-West adjusted covariance matrix, taking into account + specified number of leads / lags + + Parameters + ---------- + m: (N x K) + max_lags: int + nobs: int + Number of observations in model + df: int + Degrees of freedom in explanatory variables + nw_overlap: boolean + + Returns + ------- + ndarray (K x K) + + Reference + --------- + Newey, W. K. & West, K. D. (1987) A Simple, Positive + Semi-definite, Heteroskedasticity and Autocorrelation Consistent + Covariance Matrix, Econometrica, vol. 55(3), 703-708 + """ + Xeps = np.dot(m.T, m) + for lag in xrange(1, max_lags + 1): + auto_cov = np.dot(m[:-lag].T, m[lag:]) + weight = lag / (max_lags + 1) + if nw_overlap: + weight = 0 + bb = auto_cov + auto_cov.T + dd = (1 - weight) * bb + Xeps += dd + + Xeps *= nobs / (nobs - df) + + if nw_overlap and not is_psd(Xeps): + new_max_lags = int(np.ceil(max_lags * 1.5)) +# print ('nw_overlap is True and newey_west generated a non positive ' +# 'semidefinite matrix, so using newey_west with max_lags of %d.' +# % new_max_lags) + return newey_west(m, new_max_lags, nobs, df) + + return Xeps + +def calc_F(R, r, beta, var_beta, nobs, df): + """ + Computes the standard F-test statistic for linear restriction + hypothesis testing + + Parameters + ---------- + R: ndarray (N x N) + Restriction matrix + r: ndarray (N x 1) + Restriction vector + beta: ndarray (N x 1) + Estimated model coefficients + var_beta: ndarray (N x N) + Variance covariance matrix of regressors + nobs: int + Number of observations in model + df: int + Model degrees of freedom + + Returns + ------- + F value, (q, df_resid), p value + """ + from scipy.stats import f + + hyp = np.dot(R, beta.reshape(len(beta), 1)) - r + RSR = np.dot(R, np.dot(var_beta, R.T)) + + q = len(r) + + F = np.dot(hyp.T, np.dot(inv(RSR), hyp)).squeeze() / q + + p_value = 1 - f.cdf(F, q, nobs - df) + + return F, (q, nobs - df), p_value + diff --git a/pandas/stats/misc.py b/pandas/stats/misc.py new file mode 100644 index 00000000..7e5419b7 --- /dev/null +++ b/pandas/stats/misc.py @@ -0,0 +1,289 @@ +from numpy import NaN +import numpy as np + +from pandas.core.api import Series, DataFrame, isnull, notnull +from pandas.core.series import remove_na + +from pandas.tools.tile import quantileTS + +def zscore(series): + return (series - series.mean()) / np.std(series, ddof = 0) + + +def correl_ts(frame1, frame2): + """ + Pairwise correlation of columns of two DataFrame objects + + Parameters + ---------- + + Returns + ------- + y : Series + """ + results = {} + for col, series in frame1.iteritems(): + if col in frame2: + other = frame2[col] + + idx1 = series.valid().index + idx2 = other.valid().index + + common_index = idx1.intersection(idx2) + + seriesStand = zscore(series.reindex(common_index)) + otherStand = zscore(other.reindex(common_index)) + results[col] = 
(seriesStand * otherStand).mean() + + return Series(results) + +def correl_xs(frame1, frame2): + return correl_ts(frame1.T, frame2.T) + + +def percentileRank(frame, column=None, kind='mean'): + """ + Return score at percentile for each point in time (cross-section) + + Parameters + ---------- + frame: DataFrame + column: string or Series, optional + Column name or specific Series to compute percentiles for. + If not provided, percentiles are computed for all values at each + point in time. Note that this can take a LONG time. + kind: {'rank', 'weak', 'strict', 'mean'}, optional + This optional parameter specifies the interpretation of the + resulting score: + + - "rank": Average percentage ranking of score. In case of + multiple matches, average the percentage rankings of + all matching scores. + - "weak": This kind corresponds to the definition of a cumulative + distribution function. A percentileofscore of 80% + means that 80% of values are less than or equal + to the provided score. + - "strict": Similar to "weak", except that only values that are + strictly less than the given score are counted. + - "mean": The average of the "weak" and "strict" scores, often used in + testing. See + + http://en.wikipedia.org/wiki/Percentile_rank + + Returns + ------- + TimeSeries or DataFrame, depending on input + """ + from pandas.compat.scipy import percentileofscore + fun = lambda xs, score: percentileofscore(remove_na(xs), + score, kind=kind) + + results = {} + framet = frame.T + if column is not None: + if isinstance(column, Series): + for date, xs in frame.T.iteritems(): + results[date] = fun(xs, column.get(date, NaN)) + else: + for date, xs in frame.T.iteritems(): + results[date] = fun(xs, xs[column]) + results = Series(results) + else: + for column in frame.columns: + for date, xs in framet.iteritems(): + results.setdefault(date, {})[column] = fun(xs, xs[column]) + results = DataFrame(results).T + return results + + +def bucket(series, k, by=None): + """ + Produce DataFrame representing quantiles of a Series + + Parameters + ---------- + series : Series + k : int + number of quantiles + by : Series or same-length array + bucket by value + + Returns + ------- + DataFrame + """ + if by is None: + by = series + else: + by = by.reindex(series.index) + + split = _split_quantile(by, k) + mat = np.empty((len(series), k), dtype=float) * np.NaN + + for i, v in enumerate(split): + mat[:, i][v] = series.take(v) + + return DataFrame(mat, index=series.index, columns=np.arange(k) + 1) + +def _split_quantile(arr, k): + arr = np.asarray(arr) + mask = np.isfinite(arr) + order = arr[mask].argsort() + n = len(arr) + + return np.array_split(np.arange(n)[mask].take(order), k) + +def bucketcat(series, cats): + """ + Produce DataFrame representing quantiles of a Series + + Parameters + ---------- + series : Series + cat : Series or same-length array + bucket by category; mutually exxlusive with 'by' + + Returns + ------- + DataFrame + """ + if not isinstance(series, Series): + series = Series(series, index=np.arange(len(series))) + + cats = np.asarray(cats) + + unique_labels = np.unique(cats) + unique_labels = unique_labels[com.notnull(unique_labels)] + + # group by + data = {} + + for label in unique_labels: + data[label] = series[cats == label] + + return DataFrame(data, columns=unique_labels) + +def bucketpanel(series, bins=None, by=None, cat=None): + """ + Bucket data by two Series to create summary panel + + Parameters + ---------- + series : Series + bins : tuple (length-2) + e.g. 
(2, 2) + by : tuple of Series + bucket by value + cat : tuple of Series + bucket by category; mutually exxlusive with 'by' + + Returns + ------- + DataFrame + """ + use_by = by is not None + use_cat = cat is not None + + if use_by and use_cat: + raise Exception('must specify by or cat, but not both') + elif use_by: + if len(by) != 2: + raise Exception('must provide two bucketing series') + + xby, yby = by + xbins, ybins = bins + + return _bucketpanel_by(series, xby, yby, xbins, ybins) + + elif use_cat: + xcat, ycat = cat + return _bucketpanel_cat(series, xcat, ycat) + else: + raise Exception('must specify either values or categories to bucket by') + +def _bucketpanel_by(series, xby, yby, xbins, ybins): + xby = xby.reindex(series.index) + yby = yby.reindex(series.index) + + xlabels = _bucket_labels(xby.reindex(series.index), xbins) + ylabels = _bucket_labels(yby.reindex(series.index), ybins) + + labels = _uniquify(xlabels, ylabels, xbins, ybins) + + mask = com.isnull(labels) + labels[mask] = -1 + + unique_labels = np.unique(labels) + bucketed = bucketcat(series, labels) + + _ulist = list(labels) + index_map = dict((x, _ulist.index(x)) for x in unique_labels) + + def relabel(key): + pos = index_map[key] + + xlab = xlabels[pos] + ylab = ylabels[pos] + + return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL', + int(ylab) if com.notnull(ylab) else 'NULL') + + return bucketed.rename(columns=relabel) + +def _bucketpanel_cat(series, xcat, ycat): + xlabels, xmapping = _intern(xcat) + ylabels, ymapping = _intern(ycat) + + shift = 10 ** (np.ceil(np.log10(ylabels.max()))) + labels = xlabels * shift + ylabels + + sorter = labels.argsort() + sorted_labels = labels.take(sorter) + sorted_xlabels = xlabels.take(sorter) + sorted_ylabels = ylabels.take(sorter) + + unique_labels = np.unique(labels) + unique_labels = unique_labels[com.notnull(unique_labels)] + + locs = sorted_labels.searchsorted(unique_labels) + xkeys = sorted_xlabels.take(locs) + ykeys = sorted_ylabels.take(locs) + + stringified = ['(%s, %s)' % arg + for arg in zip(xmapping.take(xkeys), ymapping.take(ykeys))] + + result = bucketcat(series, labels) + result.columns = stringified + + return result + +def _intern(values): + # assumed no NaN values + values = np.asarray(values) + + uniqued = np.unique(values) + labels = uniqued.searchsorted(values) + return labels, uniqued + + +def _uniquify(xlabels, ylabels, xbins, ybins): + # encode the stuff, create unique label + shifter = 10 ** max(xbins, ybins) + _xpiece = xlabels * shifter + _ypiece = ylabels + + return _xpiece + _ypiece + +def _bucket_labels(series, k): + arr = np.asarray(series) + mask = np.isfinite(arr) + order = arr[mask].argsort() + n = len(series) + + split = np.array_split(np.arange(n)[mask].take(order), k) + + mat = np.empty(n, dtype=float) * np.NaN + for i, v in enumerate(split): + mat[v] = i + + return mat + 1 diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py new file mode 100644 index 00000000..a638fe01 --- /dev/null +++ b/pandas/stats/moments.py @@ -0,0 +1,462 @@ +""" +Provides rolling statistical moments and related descriptive +statistics implemented in Cython +""" +from __future__ import division + +from functools import wraps + +from numpy import NaN +import numpy as np + +from pandas.core.api import DataFrame, Series, notnull +import pandas.lib as _tseries + +from pandas.util.decorators import Substitution, Appender + +__all__ = ['rolling_count', 'rolling_max', 'rolling_min', + 'rolling_sum', 'rolling_mean', 'rolling_std', 'rolling_cov', + 
'rolling_corr', 'rolling_var', 'rolling_skew', 'rolling_kurt', + 'rolling_quantile', 'rolling_median', 'rolling_apply', + 'rolling_corr_pairwise', + 'ewma', 'ewmvar', 'ewmstd', 'ewmvol', 'ewmcorr', 'ewmcov'] + +#------------------------------------------------------------------------------- +# Docs + +_doc_template = """ +%s + +Parameters +---------- +%s +window : Number of observations used for calculating statistic +min_periods : int + Minimum number of observations in window required to have a value +freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic + +Returns +------- +%s +""" + + +_ewm_doc = r"""%s + +Parameters +---------- +%s +com : float. optional + Center of mass: \alpha = com / (1 + com), +span : float, optional + Specify decay in terms of span, \alpha = 2 / (span + 1) +min_periods : int, default 0 + Number of observations in sample to require (only affects + beginning) +freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic +%s +Notes +----- +Either center of mass or span must be specified + +EWMA is sometimes specified using a "span" parameter s, we have have that the +decay parameter \alpha is related to the span as :math:`\alpha = 1 - 2 / (s + 1) += c / (1 + c)` + +where c is the center of mass. Given a span, the associated center of mass is +:math:`c = (s - 1) / 2` + +So a "20-day EWMA" would have center 9.5. + +Returns +------- +y : type of input argument +""" + +_type_of_input = "y : type of input argument" + +_flex_retval = """y : type depends on inputs + DataFrame / DataFrame -> DataFrame (matches on columns) + DataFrame / Series -> Computes result for each column + Series / Series -> Series""" + +_unary_arg = "arg : Series, DataFrame" + +_binary_arg_flex = """arg1 : Series, DataFrame, or ndarray +arg2 : Series, DataFrame, or ndarray""" + +_binary_arg = """arg1 : Series, DataFrame, or ndarray +arg2 : Series, DataFrame, or ndarray""" + +_bias_doc = r"""bias : boolean, default False + Use a standard estimation bias correction +""" +def rolling_count(arg, window, freq=None, time_rule=None): + """ + Rolling count of number of non-NaN observations inside provided window. + + Parameters + ---------- + arg : DataFrame or numpy ndarray-like + window : Number of observations used for calculating statistic + freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic + + Returns + ------- + rolling_count : type of caller + """ + arg = _conv_timerule(arg, freq, time_rule) + window = min(window, len(arg)) + + return_hook, values = _process_data_structure(arg, kill_inf=False) + + converted = np.isfinite(values).astype(float) + result = rolling_sum(converted, window, min_periods=1, + time_rule=time_rule) + + # putmask here? 
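+    # treat any NaN sums as a count of zero (np.putmask would do the same job)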
+ result[np.isnan(result)] = 0 + + return return_hook(result) + +@Substitution("Unbiased moving covariance", _binary_arg_flex, _flex_retval) +@Appender(_doc_template) +def rolling_cov(arg1, arg2, window, min_periods=None, time_rule=None): + def _get_cov(X, Y): + mean = lambda x: rolling_mean(x, window, min_periods, time_rule) + count = rolling_count(X + Y, window, time_rule) + bias_adj = count / (count - 1) + return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj + return _flex_binary_moment(arg1, arg2, _get_cov) + +@Substitution("Moving sample correlation", _binary_arg_flex, _flex_retval) +@Appender(_doc_template) +def rolling_corr(arg1, arg2, window, min_periods=None, time_rule=None): + def _get_corr(a, b): + num = rolling_cov(a, b, window, min_periods, time_rule) + den = (rolling_std(a, window, min_periods, time_rule) * + rolling_std(b, window, min_periods, time_rule)) + return num / den + return _flex_binary_moment(arg1, arg2, _get_corr) + +def _flex_binary_moment(arg1, arg2, f): + if isinstance(arg1, np.ndarray) and isinstance(arg2, np.ndarray): + X, Y = _prep_binary(arg1, arg2) + return f(X, Y) + elif isinstance(arg1, DataFrame): + results = {} + if isinstance(arg2, DataFrame): + X, Y = arg1.align(arg2, join='outer') + X = X + 0 * Y + Y = Y + 0 * X + res_columns = arg1.columns.union(arg2.columns) + for col in res_columns: + if col in X and col in Y: + results[col] = f(X[col], Y[col]) + else: + res_columns = arg1.columns + X, Y = arg1.align(arg2, axis=0, join='outer') + results = {} + + for col in res_columns: + results[col] = f(X[col], Y) + + return DataFrame(results, index=X.index, columns=res_columns) + else: + return _flex_binary_moment(arg2, arg1, f) + +def rolling_corr_pairwise(df, window, min_periods=None): + """ + Computes pairwise rolling correlation matrices as Panel whose items are + dates + + Parameters + ---------- + df : DataFrame + window : int + min_periods : int, default None + + Returns + ------- + correls : Panel + """ + from pandas import Panel + from collections import defaultdict + + all_results = defaultdict(dict) + + for i, k1 in enumerate(df.columns): + for k2 in df.columns[i:]: + corr = rolling_corr(df[k1], df[k2], window, + min_periods=min_periods) + all_results[k1][k2] = corr + all_results[k2][k1] = corr + + return Panel.from_dict(all_results).swapaxes('items', 'major') + +def _rolling_moment(arg, window, func, minp, axis=0, freq=None, + time_rule=None, **kwargs): + """ + Rolling statistical measure using supplied function. Designed to be + used with passed-in Cython array-based functions. + + Parameters + ---------- + arg : DataFrame or numpy ndarray-like + window : Number of observations used for calculating statistic + func : Cython function to compute rolling statistic on raw series + minp : int + Minimum number of observations required to have a value + axis : int, default 0 + freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic + + Returns + ------- + y : type of input + """ + arg = _conv_timerule(arg, freq, time_rule) + calc = lambda x: func(x, window, minp=minp, **kwargs) + return_hook, values = _process_data_structure(arg) + # actually calculate the moment. Faster way to do this? 
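+    # run the Cython window kernel down the requested axis
+    # (column-by-column for the default axis=0)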
+ result = np.apply_along_axis(calc, axis, values) + + return return_hook(result) + +def _process_data_structure(arg, kill_inf=True): + if isinstance(arg, DataFrame): + return_hook = lambda v: type(arg)(v, index=arg.index, + columns=arg.columns) + values = arg.values + elif isinstance(arg, Series): + values = arg.values + return_hook = lambda v: Series(v, arg.index) + else: + return_hook = lambda v: v + values = arg + + if not issubclass(values.dtype.type, float): + values = values.astype(float) + + if kill_inf: + values = values.copy() + values[np.isinf(values)] = np.NaN + + return return_hook, values + +#------------------------------------------------------------------------------- +# Exponential moving moments + +def _get_center_of_mass(com, span): + if span is not None: + if com is not None: + raise Exception("com and span are mutually exclusive") + + # convert span to center of mass + com = (span - 1) / 2. + + elif com is None: + raise Exception("Must pass either com or span") + + return float(com) + +@Substitution("Exponentially-weighted moving average", _unary_arg, "") +@Appender(_ewm_doc) +def ewma(arg, com=None, span=None, min_periods=0, freq=None, time_rule=None): + com = _get_center_of_mass(com, span) + arg = _conv_timerule(arg, freq, time_rule) + + def _ewma(v): + result = _tseries.ewma(v, com) + first_index = _first_valid_index(v) + result[first_index : first_index + min_periods] = NaN + return result + + return_hook, values = _process_data_structure(arg) + output = np.apply_along_axis(_ewma, 0, values) + return return_hook(output) + +def _first_valid_index(arr): + # argmax scans from left + return notnull(arr).argmax() + +@Substitution("Exponentially-weighted moving variance", _unary_arg, _bias_doc) +@Appender(_ewm_doc) +def ewmvar(arg, com=None, span=None, min_periods=0, bias=False, + freq=None, time_rule=None): + com = _get_center_of_mass(com, span) + arg = _conv_timerule(arg, freq, time_rule) + moment2nd = ewma(arg * arg, com=com, min_periods=min_periods) + moment1st = ewma(arg, com=com, min_periods=min_periods) + + result = moment2nd - moment1st ** 2 + if not bias: + result *= (1.0 + 2.0 * com) / (2.0 * com) + + return result + +@Substitution("Exponentially-weighted moving std", _unary_arg, _bias_doc) +@Appender(_ewm_doc) +def ewmstd(arg, com=None, span=None, min_periods=0, bias=False, + time_rule=None): + result = ewmvar(arg, com=com, span=span, time_rule=time_rule, + min_periods=min_periods, bias=bias) + return np.sqrt(result) + +ewmvol = ewmstd + +@Substitution("Exponentially-weighted moving covariance", _binary_arg, "") +@Appender(_ewm_doc) +def ewmcov(arg1, arg2, com=None, span=None, min_periods=0, bias=False, + freq=None, time_rule=None): + X, Y = _prep_binary(arg1, arg2) + + X = _conv_timerule(X, freq, time_rule) + Y = _conv_timerule(Y, freq, time_rule) + + mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods) + + result = (mean(X*Y) - mean(X) * mean(Y)) + com = _get_center_of_mass(com, span) + if not bias: + result *= (1.0 + 2.0 * com) / (2.0 * com) + + return result + +@Substitution("Exponentially-weighted moving " "correlation", _binary_arg, "") +@Appender(_ewm_doc) +def ewmcorr(arg1, arg2, com=None, span=None, min_periods=0, + freq=None, time_rule=None): + X, Y = _prep_binary(arg1, arg2) + + X = _conv_timerule(X, freq, time_rule) + Y = _conv_timerule(Y, freq, time_rule) + + mean = lambda x: ewma(x, com=com, span=span, min_periods=min_periods) + var = lambda x: ewmvar(x, com=com, span=span, min_periods=min_periods, + bias=True) + return 
(mean(X*Y) - mean(X)*mean(Y)) / np.sqrt(var(X) * var(Y)) + +def _prep_binary(arg1, arg2): + if not isinstance(arg2, type(arg1)): + raise Exception('Input arrays must be of the same type!') + + # mask out values, this also makes a common index... + X = arg1 + 0 * arg2 + Y = arg2 + 0 * arg1 + + return X, Y + +#---------------------------------------------------------------------- +# Python interface to Cython functions + +def _conv_timerule(arg, freq, time_rule): + if time_rule is not None: + import warnings + warnings.warn("time_rule argument is deprecated, replace with freq", + FutureWarning) + + freq = time_rule + + types = (DataFrame, Series) + if freq is not None and isinstance(arg, types): + # Conform to whatever frequency needed. + arg = arg.resample(freq) + + return arg + +def _require_min_periods(p): + def _check_func(minp, window): + if minp is None: + return window + else: + return max(p, minp) + return _check_func + +def _use_window(minp, window): + if minp is None: + return window + else: + return minp + +def _rolling_func(func, desc, check_minp=_use_window): + @Substitution(desc, _unary_arg, _type_of_input) + @Appender(_doc_template) + @wraps(func) + def f(arg, window, min_periods=None, freq=None, time_rule=None, **kwargs): + def call_cython(arg, window, minp, **kwds): + minp = check_minp(minp, window) + return func(arg, window, minp, **kwds) + return _rolling_moment(arg, window, call_cython, min_periods, + freq=freq, time_rule=time_rule, **kwargs) + + return f + +rolling_max = _rolling_func(_tseries.roll_max, 'Moving maximum') +rolling_min = _rolling_func(_tseries.roll_min, 'Moving minimum') +rolling_sum = _rolling_func(_tseries.roll_sum, 'Moving sum') +rolling_mean = _rolling_func(_tseries.roll_mean, 'Moving mean') +rolling_median = _rolling_func(_tseries.roll_median_cython, 'Moving median') + +_ts_std = lambda *a, **kw: np.sqrt(_tseries.roll_var(*a, **kw)) +rolling_std = _rolling_func(_ts_std, 'Unbiased moving standard deviation', + check_minp=_require_min_periods(2)) +rolling_var = _rolling_func(_tseries.roll_var, 'Unbiased moving variance', + check_minp=_require_min_periods(2)) +rolling_skew = _rolling_func(_tseries.roll_skew, 'Unbiased moving skewness', + check_minp=_require_min_periods(3)) +rolling_kurt = _rolling_func(_tseries.roll_kurt, 'Unbiased moving kurtosis', + check_minp=_require_min_periods(4)) + +def rolling_quantile(arg, window, quantile, min_periods=None, freq=None, + time_rule=None): + """Moving quantile + + Parameters + ---------- + arg : Series, DataFrame + window : Number of observations used for calculating statistic + quantile : 0 <= quantile <= 1 + min_periods : int + Minimum number of observations in window required to have a value + freq : None or string alias / date offset object, default=None + Frequency to conform to before computing statistic + + Returns + ------- + y : type of input argument + """ + + def call_cython(arg, window, minp): + minp = _use_window(minp, window) + return _tseries.roll_quantile(arg, window, minp, quantile) + return _rolling_moment(arg, window, call_cython, min_periods, + freq=freq, time_rule=time_rule) + +def rolling_apply(arg, window, func, min_periods=None, freq=None, + time_rule=None): + """Generic moving function application + + Parameters + ---------- + arg : Series, DataFrame + window : Number of observations used for calculating statistic + func : function + Must produce a single value from an ndarray input + min_periods : int + Minimum number of observations in window required to have a value + freq : None or 
string alias / date offset object, default=None + Frequency to conform to before computing statistic + + Returns + ------- + y : type of input argument + """ + def call_cython(arg, window, minp): + minp = _use_window(minp, window) + return _tseries.roll_generic(arg, window, minp, func) + return _rolling_moment(arg, window, call_cython, min_periods, + freq=freq, time_rule=time_rule) diff --git a/pandas/stats/ols.py b/pandas/stats/ols.py new file mode 100644 index 00000000..9b2bf589 --- /dev/null +++ b/pandas/stats/ols.py @@ -0,0 +1,1320 @@ +""" +Ordinary least squares regression +""" + +# pylint: disable-msg=W0201 + +from itertools import izip, starmap +from StringIO import StringIO + +import numpy as np + +from pandas.core.api import DataFrame, Series, isnull +from pandas.core.common import _ensure_float64 +from pandas.core.index import MultiIndex +from pandas.core.panel import Panel +from pandas.util.decorators import cache_readonly + +import pandas.stats.common as scom +import pandas.stats.math as math +import pandas.stats.moments as moments + +_FP_ERR = 1e-8 + +class OLS(object): + """ + Runs a full sample ordinary least squares regression. + + Parameters + ---------- + y: Series + x: Series, DataFrame, dict of Series + intercept: bool + True if you want an intercept. + nw_lags: None or int + Number of Newey-West lags. + """ + _panel_model = False + + def __init__(self, y, x, intercept=True, weights=None, nw_lags=None, + nw_overlap=False): + try: + import statsmodels.api as sm + except ImportError: + import scikits.statsmodels.api as sm + + self._x_orig = x + self._y_orig = y + self._weights_orig = weights + self._intercept = intercept + self._nw_lags = nw_lags + self._nw_overlap = nw_overlap + + (self._y, self._x, self._weights, self._x_filtered, + self._index, self._time_has_obs) = self._prepare_data() + + if self._weights is not None: + self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0) + self._y_trans = self._y * np.sqrt(self._weights) + self.sm_ols = sm.WLS(self._y.values, + self._x.values, + weights=self._weights.values).fit() + else: + self._x_trans = self._x + self._y_trans = self._y + self.sm_ols = sm.OLS(self._y.values, + self._x.values).fit() + + def _prepare_data(self): + """ + Cleans the input for single OLS. + + Parameters + ---------- + lhs: Series + Dependent variable in the regression. + rhs: dict, whose values are Series, DataFrame, or dict + Explanatory variables of the regression. + + Returns + ------- + Series, DataFrame + Cleaned lhs and rhs + """ + (filt_lhs, filt_rhs, filt_weights, + pre_filt_rhs, index, valid) = _filter_data(self._y_orig, self._x_orig, + self._weights_orig) + if self._intercept: + filt_rhs['intercept'] = 1. + pre_filt_rhs['intercept'] = 1. 
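+            # the constant is added under the name 'intercept' so downstream
+            # code (e.g. the F-stat machinery) can locate it by column name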
+ + return (filt_lhs, filt_rhs, filt_weights, + pre_filt_rhs, index, valid) + + @property + def nobs(self): + return self._nobs + + @property + def _nobs(self): + return len(self._y) + + @property + def nw_lags(self): + return self._nw_lags + + @property + def x(self): + """Returns the filtered x used in the regression.""" + return self._x + + @property + def y(self): + """Returns the filtered y used in the regression.""" + return self._y + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + return self.sm_ols.params + + @cache_readonly + def beta(self): + """Returns the betas in Series form.""" + return Series(self._beta_raw, index=self._x.columns) + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + return math.rank(self._x.values) + + @cache_readonly + def df(self): + """Returns the degrees of freedom. + + This equals the rank of the X matrix. + """ + return self._df_raw + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self.sm_ols.df_model + + @cache_readonly + def df_model(self): + """Returns the degrees of freedom of the model.""" + return self._df_model_raw + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self.sm_ols.df_resid + + @cache_readonly + def df_resid(self): + """Returns the degrees of freedom of the residuals.""" + return self._df_resid_raw + + @cache_readonly + def _f_stat_raw(self): + """Returns the raw f-stat value.""" + from scipy.stats import f + + cols = self._x.columns + + if self._nw_lags is None: + F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) + + q = len(cols) + if 'intercept' in cols: + q -= 1 + + shape = q, self.df_resid + p_value = 1 - f.cdf(F, shape[0], shape[1]) + return F, shape, p_value + + k = len(cols) + R = np.eye(k) + r = np.zeros((k, 1)) + + try: + intercept = cols.get_loc('intercept') + R = np.concatenate((R[0 : intercept], R[intercept + 1:])) + r = np.concatenate((r[0 : intercept], r[intercept + 1:])) + except KeyError: + # no intercept + pass + + return math.calc_F(R, r, self._beta_raw, self._var_beta_raw, + self._nobs, self.df) + + @cache_readonly + def f_stat(self): + """Returns the f-stat value.""" + return f_stat_to_dict(self._f_stat_raw) + + def f_test(self, hypothesis): + """Runs the F test, given a joint hypothesis. The hypothesis is + represented by a collection of equations, in the form + + A*x_1+B*x_2=C + + You must provide the coefficients even if they're 1. No spaces. + + The equations can be passed as either a single string or a + list of strings. + + Examples + -------- + o = ols(...) 
+ o.f_test('1*x1+2*x2=0,1*x3=0') + o.f_test(['1*x1+2*x2=0','1*x3=0']) + """ + + x_names = self._x.columns + + R = [] + r = [] + + if isinstance(hypothesis, str): + eqs = hypothesis.split(',') + elif isinstance(hypothesis, list): + eqs = hypothesis + else: # pragma: no cover + raise Exception('hypothesis must be either string or list') + for equation in eqs: + row = np.zeros(len(x_names)) + lhs, rhs = equation.split('=') + for s in lhs.split('+'): + ss = s.split('*') + coeff = float(ss[0]) + x_name = ss[1] + + if x_name not in x_names: + raise Exception('no coefficient named %s' % x_name) + idx = x_names.get_loc(x_name) + row[idx] = coeff + rhs = float(rhs) + + R.append(row) + r.append(rhs) + + R = np.array(R) + q = len(r) + r = np.array(r).reshape(q, 1) + + result = math.calc_F(R, r, self._beta_raw, self._var_beta_raw, + self._nobs, self.df) + + return f_stat_to_dict(result) + + @cache_readonly + def _p_value_raw(self): + """Returns the raw p values.""" + from scipy.stats import t + + return 2 * t.sf(np.fabs(self._t_stat_raw), + self._df_resid_raw) + + @cache_readonly + def p_value(self): + """Returns the p values.""" + return Series(self._p_value_raw, index=self.beta.index) + + @cache_readonly + def _r2_raw(self): + """Returns the raw r-squared values.""" + if self._use_centered_tss: + return 1 - self.sm_ols.ssr / self.sm_ols.centered_tss + else: + return 1 - self.sm_ols.ssr / self.sm_ols.uncentered_tss + + @property + def _use_centered_tss(self): + # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR + return self._intercept + + @cache_readonly + def r2(self): + """Returns the r-squared values.""" + return self._r2_raw + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + return self.sm_ols.rsquared_adj + + @cache_readonly + def r2_adj(self): + """Returns the r-squared adjusted values.""" + return self._r2_adj_raw + + @cache_readonly + def _resid_raw(self): + """Returns the raw residuals.""" + return self.sm_ols.resid + + @cache_readonly + def resid(self): + """Returns the residuals.""" + return Series(self._resid_raw, index=self._x.index) + + @cache_readonly + def _rmse_raw(self): + """Returns the raw rmse values.""" + return np.sqrt(self.sm_ols.mse_resid) + + @cache_readonly + def rmse(self): + """Returns the rmse value.""" + return self._rmse_raw + + @cache_readonly + def _std_err_raw(self): + """Returns the raw standard err values.""" + return np.sqrt(np.diag(self._var_beta_raw)) + + @cache_readonly + def std_err(self): + """Returns the standard err values of the betas.""" + return Series(self._std_err_raw, index=self.beta.index) + + @cache_readonly + def _t_stat_raw(self): + """Returns the raw t-stat value.""" + return self._beta_raw / self._std_err_raw + + @cache_readonly + def t_stat(self): + """Returns the t-stat values of the betas.""" + return Series(self._t_stat_raw, index=self.beta.index) + + @cache_readonly + def _var_beta_raw(self): + """ + Returns the raw covariance of beta. 
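+
+        With nw_lags unset this is the classical sigma**2 * inv(X'X) estimate;
+        otherwise a Newey-West HAC covariance is built from the residuals.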
+ """ + x = self._x.values + y = self._y.values + + xx = np.dot(x.T, x) + + if self._nw_lags is None: + return math.inv(xx) * (self._rmse_raw ** 2) + else: + resid = y - np.dot(x, self._beta_raw) + m = (x.T * resid).T + + xeps = math.newey_west(m, self._nw_lags, self._nobs, self._df_raw, + self._nw_overlap) + + xx_inv = math.inv(xx) + return np.dot(xx_inv, np.dot(xeps, xx_inv)) + + @cache_readonly + def var_beta(self): + """Returns the variance-covariance matrix of beta.""" + return DataFrame(self._var_beta_raw, index=self.beta.index, + columns=self.beta.index) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + if self._weights is None: + X = self._x_filtered.values + else: + # XXX + return self.sm_ols.fittedvalues + + b = self._beta_raw + return np.dot(X, b) + + @cache_readonly + def y_fitted(self): + """Returns the fitted y values. This equals BX.""" + if self._weights is None: + index = self._x_filtered.index + orig_index = index + else: + index = self._y.index + orig_index = self._y_orig.index + + result = Series(self._y_fitted_raw, index=index) + return result.reindex(orig_index) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + return self._y_fitted_raw + + @cache_readonly + def y_predict(self): + """Returns the predicted y values. + + For in-sample, this is same as y_fitted.""" + return self.y_fitted + + def predict(self, beta=None, x=None, fill_value=None, + fill_method=None, axis=0): + """ + Parameters + ---------- + beta : Series + x : Series or DataFrame + fill_value : scalar or dict, default None + fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + axis : {0, 1}, default 0 + See DataFrame.fillna for more details + + Notes + ----- + 1. If both fill_value and fill_method are None then NaNs are dropped + (this is the default behavior) + 2. An intercept will be automatically added to the new_y_values if + the model was fitted using an intercept + + Returns + ------- + Series of predicted values + """ + if beta is None and x is None: + return self.y_predict + + if beta is None: + beta = self.beta + else: + beta = beta.reindex(self.beta.index) + if isnull(beta).any(): + raise ValueError('Must supply betas for same variables') + + if x is None: + x = self._x + orig_x = x + else: + orig_x = x + if fill_value is None and fill_method is None: + x = x.dropna(how='any') + else: + x = x.fillna(value=fill_value, method=fill_method, axis=axis) + if isinstance(x, Series): + x = DataFrame({'x' : x}) + if self._intercept: + x['intercept'] = 1. 
+ + x = x.reindex(columns=self._x.columns) + + rs = np.dot(x.values, beta.values) + return Series(rs, x.index).reindex(orig_x.index) + + RESULT_FIELDS = ['r2', 'r2_adj', 'df', 'df_model', 'df_resid', 'rmse', + 'f_stat', 'beta', 'std_err', 't_stat', 'p_value', 'nobs'] + + @cache_readonly + def _results(self): + results = {} + for result in self.RESULT_FIELDS: + results[result] = getattr(self, result) + + return results + + @cache_readonly + def _coef_table(self): + buf = StringIO() + + buf.write('%14s %10s %10s %10s %10s %10s %10s\n' % + ('Variable', 'Coef', 'Std Err', 't-stat', + 'p-value', 'CI 2.5%', 'CI 97.5%')) + buf.write(scom.banner('')) + coef_template = '\n%14s %10.4f %10.4f %10.2f %10.4f %10.4f %10.4f' + + results = self._results + + beta = results['beta'] + + for i, name in enumerate(beta.index): + if i and not (i % 5): + buf.write('\n' + scom.banner('')) + + std_err = results['std_err'][name] + CI1 = beta[name] - 1.96 * std_err + CI2 = beta[name] + 1.96 * std_err + + t_stat = results['t_stat'][name] + p_value = results['p_value'][name] + + line = coef_template % (name, + beta[name], std_err, t_stat, p_value, CI1, CI2) + + buf.write(line) + + if self.nw_lags is not None: + buf.write('\n') + buf.write('*** The calculations are Newey-West ' + 'adjusted with lags %5d\n' % self.nw_lags) + + return buf.getvalue() + + @cache_readonly + def summary_as_matrix(self): + """Returns the formatted results of the OLS as a DataFrame.""" + results = self._results + beta = results['beta'] + data = {'beta' : results['beta'], + 't-stat' : results['t_stat'], + 'p-value' : results['p_value'], + 'std err' : results['std_err']} + return DataFrame(data, beta.index).T + + @cache_readonly + def summary(self): + """ + This returns the formatted result of the OLS computation + """ + template = """ +%(bannerTop)s + +Formula: Y ~ %(formula)s + +Number of Observations: %(nobs)d +Number of Degrees of Freedom: %(df)d + +R-squared: %(r2)10.4f +Adj R-squared: %(r2_adj)10.4f + +Rmse: %(rmse)10.4f + +F-stat %(f_stat_shape)s: %(f_stat)10.4f, p-value: %(f_stat_p_value)10.4f + +Degrees of Freedom: model %(df_model)d, resid %(df_resid)d + +%(bannerCoef)s +%(coef_table)s +%(bannerEnd)s +""" + coef_table = self._coef_table + + results = self._results + + f_stat = results['f_stat'] + + bracketed = ['<%s>' % c for c in results['beta'].index] + + formula = StringIO() + formula.write(bracketed[0]) + tot = len(bracketed[0]) + line = 1 + for coef in bracketed[1:]: + tot = tot + len(coef) + 3 + + if tot // (68 * line): + formula.write('\n' + ' ' * 12) + line += 1 + + formula.write(' + ' + coef) + + params = { + 'bannerTop' : scom.banner('Summary of Regression Analysis'), + 'bannerCoef' : scom.banner('Summary of Estimated Coefficients'), + 'bannerEnd' : scom.banner('End of Summary'), + 'formula' : formula.getvalue(), + 'r2' : results['r2'], + 'r2_adj' : results['r2_adj'], + 'nobs' : results['nobs'], + 'df' : results['df'], + 'df_model' : results['df_model'], + 'df_resid' : results['df_resid'], + 'coef_table' : coef_table, + 'rmse' : results['rmse'], + 'f_stat' : f_stat['f-stat'], + 'f_stat_shape' : '(%d, %d)' % (f_stat['DF X'], f_stat['DF Resid']), + 'f_stat_p_value' : f_stat['p-value'], + } + + return template % params + + def __repr__(self): + return self.summary + + + @cache_readonly + def _time_obs_count(self): + # XXX + return self._time_has_obs.astype(int) + + @property + def _total_times(self): + return self._time_has_obs.sum() + + +class MovingOLS(OLS): + """ + Runs a rolling/expanding simple OLS. 
+ + Parameters + ---------- + y: Series + x: Series, DataFrame, or dict of Series + intercept: bool + True if you want an intercept. + nw_lags: None or int + Number of Newey-West lags. + window_type: {'full sample', 'rolling', 'expanding'} + Default expanding + window: int + size of window (for rolling/expanding OLS) + """ + def __init__(self, y, x, weights=None, window_type='expanding', + window=None, min_periods=None, intercept=True, + nw_lags=None, nw_overlap=False): + + self._args = dict(intercept=intercept, nw_lags=nw_lags, + nw_overlap=nw_overlap) + + OLS.__init__(self, y=y, x=x, weights=weights, **self._args) + + self._set_window(window_type, window, min_periods) + + def _set_window(self, window_type, window, min_periods): + self._window_type = scom._get_window_type(window_type) + + if self._is_rolling: + assert(window is not None) + if min_periods is None: + min_periods = window + else: + window = len(self._x) + if min_periods is None: + min_periods = 1 + + self._window = int(window) + self._min_periods = min_periods + +#------------------------------------------------------------------------------- +# "Public" results + + @cache_readonly + def beta(self): + """Returns the betas in Series/DataFrame form.""" + return DataFrame(self._beta_raw, + index=self._result_index, + columns=self._x.columns) + + @cache_readonly + def rank(self): + return Series(self._rank_raw, index=self._result_index) + + @cache_readonly + def df(self): + """Returns the degrees of freedom.""" + return Series(self._df_raw, index=self._result_index) + + @cache_readonly + def df_model(self): + """Returns the model degrees of freedom.""" + return Series(self._df_model_raw, index=self._result_index) + + @cache_readonly + def df_resid(self): + """Returns the residual degrees of freedom.""" + return Series(self._df_resid_raw, index=self._result_index) + + @cache_readonly + def f_stat(self): + """Returns the f-stat value.""" + f_stat_dicts = dict((date, f_stat_to_dict(f_stat)) + for date, f_stat in zip(self.beta.index, + self._f_stat_raw)) + + return DataFrame(f_stat_dicts).T + + def f_test(self, hypothesis): + raise NotImplementedError('must use full sample') + + @cache_readonly + def forecast_mean(self): + return Series(self._forecast_mean_raw, index=self._result_index) + + @cache_readonly + def forecast_vol(self): + return Series(self._forecast_vol_raw, index=self._result_index) + + @cache_readonly + def p_value(self): + """Returns the p values.""" + cols = self.beta.columns + return DataFrame(self._p_value_raw, columns=cols, + index=self._result_index) + + @cache_readonly + def r2(self): + """Returns the r-squared values.""" + return Series(self._r2_raw, index=self._result_index) + + @cache_readonly + def resid(self): + """Returns the residuals.""" + return Series(self._resid_raw[self._valid_obs_labels], + index=self._result_index) + + @cache_readonly + def r2_adj(self): + """Returns the r-squared adjusted values.""" + index = self.r2.index + + return Series(self._r2_adj_raw, index=index) + + @cache_readonly + def rmse(self): + """Returns the rmse values.""" + return Series(self._rmse_raw, index=self._result_index) + + @cache_readonly + def std_err(self): + """Returns the standard err values.""" + return DataFrame(self._std_err_raw, columns=self.beta.columns, + index=self._result_index) + + @cache_readonly + def t_stat(self): + """Returns the t-stat value.""" + return DataFrame(self._t_stat_raw, columns=self.beta.columns, + index=self._result_index) + + @cache_readonly + def var_beta(self): + """Returns the 
covariance of beta.""" + result = {} + result_index = self._result_index + for i in xrange(len(self._var_beta_raw)): + dm = DataFrame(self._var_beta_raw[i], columns=self.beta.columns, + index=self.beta.columns) + result[result_index[i]] = dm + + return Panel.from_dict(result, intersect=False) + + @cache_readonly + def y_fitted(self): + """Returns the fitted y values.""" + return Series(self._y_fitted_raw[self._valid_obs_labels], + index=self._result_index) + + @cache_readonly + def y_predict(self): + """Returns the predicted y values.""" + return Series(self._y_predict_raw[self._valid_obs_labels], + index=self._result_index) + +#------------------------------------------------------------------------------- +# "raw" attributes, calculations + + @property + def _is_rolling(self): + return self._window_type == 'rolling' + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + beta, indices, mask = self._rolling_ols_call + + return beta[indices] + + @cache_readonly + def _result_index(self): + return self._index[self._valid_indices] + + @property + def _valid_indices(self): + return self._rolling_ols_call[1] + + @cache_readonly + def _rolling_ols_call(self): + return self._calc_betas(self._x_trans, self._y_trans) + + def _calc_betas(self, x, y): + N = len(self._index) + K = len(self._x.columns) + + betas = np.empty((N, K), dtype=float) + betas[:] = np.NaN + + valid = self._time_has_obs + enough = self._enough_obs + window = self._window + + # Use transformed (demeaned) Y, X variables + cum_xx = self._cum_xx(x) + cum_xy = self._cum_xy(x, y) + + for i in xrange(N): + if not valid[i] or not enough[i]: + continue + + xx = cum_xx[i] + xy = cum_xy[i] + if self._is_rolling and i >= window: + xx = xx - cum_xx[i - window] + xy = xy - cum_xy[i - window] + + betas[i] = math.solve(xx, xy) + + mask = -np.isnan(betas).any(axis=1) + have_betas = np.arange(N)[mask] + + return betas, have_betas, mask + + def _rolling_rank(self): + dates = self._index + window = self._window + + ranks = np.empty(len(dates), dtype=float) + ranks[:] = np.NaN + for i, date in enumerate(dates): + if self._is_rolling and i >= window: + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = self._x.truncate(before=prior_date, after=date).values + + if len(x_slice) == 0: + continue + + ranks[i] = math.rank(x_slice) + + return ranks + + def _cum_xx(self, x): + dates = self._index + K = len(x.columns) + valid = self._time_has_obs + cum_xx = [] + + slicer = lambda df, dt: df.truncate(dt, dt).values + if not self._panel_model: + _get_index = x.index.get_loc + def slicer(df, dt): + i = _get_index(dt) + return df.values[i:i+1, :] + + last = np.zeros((K, K)) + + for i, date in enumerate(dates): + if not valid[i]: + cum_xx.append(last) + continue + + x_slice = slicer(x, date) + xx = last = last + np.dot(x_slice.T, x_slice) + cum_xx.append(xx) + + return cum_xx + + def _cum_xy(self, x, y): + dates = self._index + valid = self._time_has_obs + cum_xy = [] + + x_slicer = lambda df, dt: df.truncate(dt, dt).values + if not self._panel_model: + _get_index = x.index.get_loc + def x_slicer(df, dt): + i = _get_index(dt) + return df.values[i:i+1] + + _y_get_index = y.index.get_loc + _values = y.values + if isinstance(y.index, MultiIndex): + def y_slicer(df, dt): + loc = _y_get_index(dt) + return _values[loc] + else: + def y_slicer(df, dt): + i = _y_get_index(dt) + return _values[i:i+1] + + last = np.zeros(len(x.columns)) + for i, date in enumerate(dates): + if not valid[i]: + 
cum_xy.append(last) + continue + + x_slice = x_slicer(x, date) + y_slice = y_slicer(y, date) + + xy = last = last + np.dot(x_slice.T, y_slice) + cum_xy.append(xy) + + return cum_xy + + @cache_readonly + def _rank_raw(self): + rank = self._rolling_rank() + return rank[self._valid_indices] + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + return self._rank_raw + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self._df_raw - 1 + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self._nobs - self._df_raw + + @cache_readonly + def _f_stat_raw(self): + """Returns the raw f-stat value.""" + from scipy.stats import f + + items = self.beta.columns + nobs = self._nobs + df = self._df_raw + df_resid = nobs - df + + # var_beta has not been newey-west adjusted + if self._nw_lags is None: + F = self._r2_raw / (self._r2_raw - self._r2_adj_raw) + + q = len(items) + if 'intercept' in items: + q -= 1 + + def get_result_simple(Fst, d): + return Fst, (q, d), 1 - f.cdf(Fst, q, d) + + # Compute the P-value for each pair + result = starmap(get_result_simple, izip(F, df_resid)) + + return list(result) + + K = len(items) + R = np.eye(K) + r = np.zeros((K, 1)) + + try: + intercept = items.get_loc('intercept') + R = np.concatenate((R[0 : intercept], R[intercept + 1:])) + r = np.concatenate((r[0 : intercept], r[intercept + 1:])) + except KeyError: + # no intercept + pass + + def get_result(beta, vcov, n, d): + return math.calc_F(R, r, beta, vcov, n, d) + + results = starmap(get_result, + izip(self._beta_raw, self._var_beta_raw, nobs, df)) + + return list(results) + + @cache_readonly + def _p_value_raw(self): + """Returns the raw p values.""" + from scipy.stats import t + + result = [2 * t.sf(a, b) + for a, b in izip(np.fabs(self._t_stat_raw), + self._df_resid_raw)] + + return np.array(result) + + @cache_readonly + def _resid_stats(self): + uncentered_sst = [] + sst = [] + sse = [] + + Y = self._y_trans + X = self._x_trans + + dates = self._index + window = self._window + for n, index in enumerate(self._valid_indices): + if self._is_rolling and index >= window: + prior_date = dates[index - window + 1] + else: + prior_date = dates[0] + + date = dates[index] + beta = self._beta_raw[n] + + X_slice = X.truncate(before=prior_date, after=date).values + Y_slice = _y_converter(Y.truncate(before=prior_date, after=date)) + + resid = Y_slice - np.dot(X_slice, beta) + + SS_err = (resid ** 2).sum() + SS_total = ((Y_slice - Y_slice.mean()) ** 2).sum() + SST_uncentered = (Y_slice ** 2).sum() + + sse.append(SS_err) + sst.append(SS_total) + uncentered_sst.append(SST_uncentered) + + return { + 'sse' : np.array(sse), + 'centered_tss' : np.array(sst), + 'uncentered_tss' : np.array(uncentered_sst), + } + + @cache_readonly + def _rmse_raw(self): + """Returns the raw rmse values.""" + return np.sqrt(self._resid_stats['sse'] / self._df_resid_raw) + + @cache_readonly + def _r2_raw(self): + rs = self._resid_stats + + if self._use_centered_tss: + return 1 - rs['sse'] / rs['centered_tss'] + else: + return 1 - rs['sse'] / rs['uncentered_tss'] + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + nobs = self._nobs + factors = (nobs - 1) / (nobs - self._df_raw) + return 1 - (1 - self._r2_raw) * factors + + @cache_readonly + def _resid_raw(self): + """Returns the raw residuals.""" + return (self._y.values - self._y_fitted_raw) + + @cache_readonly + def 
_std_err_raw(self): + """Returns the raw standard err values.""" + results = [] + for i in xrange(len(self._var_beta_raw)): + results.append(np.sqrt(np.diag(self._var_beta_raw[i]))) + + return np.array(results) + + @cache_readonly + def _t_stat_raw(self): + """Returns the raw t-stat value.""" + return self._beta_raw / self._std_err_raw + + @cache_readonly + def _var_beta_raw(self): + """Returns the raw covariance of beta.""" + x = self._x_trans + y = self._y_trans + dates = self._index + nobs = self._nobs + rmse = self._rmse_raw + beta = self._beta_raw + df = self._df_raw + window = self._window + cum_xx = self._cum_xx(self._x) + + results = [] + for n, i in enumerate(self._valid_indices): + xx = cum_xx[i] + date = dates[i] + + if self._is_rolling and i >= window: + xx = xx - cum_xx[i - window] + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = x.truncate(before=prior_date, after=date) + y_slice = y.truncate(before=prior_date, after=date) + xv = x_slice.values + yv = np.asarray(y_slice) + + if self._nw_lags is None: + result = math.inv(xx) * (rmse[n] ** 2) + else: + resid = yv - np.dot(xv, beta[n]) + m = (xv.T * resid).T + + xeps = math.newey_west(m, self._nw_lags, nobs[n], df[n], + self._nw_overlap) + + xx_inv = math.inv(xx) + result = np.dot(xx_inv, np.dot(xeps, xx_inv)) + + results.append(result) + + return np.array(results) + + @cache_readonly + def _forecast_mean_raw(self): + """Returns the raw covariance of beta.""" + nobs = self._nobs + window = self._window + + # x should be ones + dummy = DataFrame(index=self._y.index) + dummy['y'] = 1 + + cum_xy = self._cum_xy(dummy, self._y) + + results = [] + for n, i in enumerate(self._valid_indices): + sumy = cum_xy[i] + + if self._is_rolling and i >= window: + sumy = sumy - cum_xy[i - window] + + results.append(sumy[0] / nobs[n]) + + return np.array(results) + + @cache_readonly + def _forecast_vol_raw(self): + """Returns the raw covariance of beta.""" + beta = self._beta_raw + window = self._window + dates = self._index + x = self._x + + results = [] + for n, i in enumerate(self._valid_indices): + date = dates[i] + if self._is_rolling and i >= window: + prior_date = dates[i - window + 1] + else: + prior_date = dates[0] + + x_slice = x.truncate(prior_date, date).values + x_demeaned = x_slice - x_slice.mean(0) + x_cov = np.dot(x_demeaned.T, x_demeaned) / (len(x_slice) - 1) + + B = beta[n] + result = np.dot(B, np.dot(x_cov, B)) + results.append(np.sqrt(result)) + + return np.array(results) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + return (self._x.values * self._beta_matrix(lag=0)).sum(1) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + return (self._x.values * self._beta_matrix(lag=1)).sum(1) + + @cache_readonly + def _results(self): + results = {} + for result in self.RESULT_FIELDS: + value = getattr(self, result) + if isinstance(value, Series): + value = value[self.beta.index[-1]] + elif isinstance(value, DataFrame): + value = value.xs(self.beta.index[-1]) + else: # pragma: no cover + raise Exception('Problem retrieving %s' % result) + results[result] = value + + return results + + @cache_readonly + def _window_time_obs(self): + window_obs = moments.rolling_sum(self._time_obs_count > 0, + self._window, min_periods=1) + + window_obs[np.isnan(window_obs)] = 0 + return window_obs.astype(int) + + @cache_readonly + def _nobs_raw(self): + if self._is_rolling: + window = self._window + else: + # expanding case + window = 
len(self._index) + + result = moments.rolling_sum(self._time_obs_count, window, + min_periods=1) + + return result.astype(int) + + def _beta_matrix(self, lag=0): + assert(lag >= 0) + + betas = self._beta_raw + + labels = np.arange(len(self._y)) - lag + indexer = self._valid_obs_labels.searchsorted(labels, side='left') + indexer[indexer == len(betas)] = len(betas) - 1 + + beta_matrix = betas[indexer] + beta_matrix[labels < self._valid_obs_labels[0]] = np.NaN + + return beta_matrix + + @cache_readonly + def _valid_obs_labels(self): + dates = self._index[self._valid_indices] + return self._y.index.searchsorted(dates) + + @cache_readonly + def _nobs(self): + return self._nobs_raw[self._valid_indices] + + @property + def nobs(self): + return Series(self._nobs, index=self._result_index) + + @cache_readonly + def _enough_obs(self): + # XXX: what's the best way to determine where to start? + return self._nobs_raw >= max(self._min_periods, + len(self._x.columns) + 1) + +def _safe_update(d, other): + """ + Combine dictionaries with non-overlapping keys + """ + for k, v in other.iteritems(): + if k in d: + raise Exception('Duplicate regressor: %s' % k) + + d[k] = v + +def _filter_data(lhs, rhs, weights=None): + """ + Cleans the input for single OLS. + + Parameters + ---------- + lhs: Series + Dependent variable in the regression. + rhs: dict, whose values are Series, DataFrame, or dict + Explanatory variables of the regression. + + Returns + ------- + Series, DataFrame + Cleaned lhs and rhs + """ + if not isinstance(lhs, Series): + assert(len(lhs) == len(rhs)) + lhs = Series(lhs, index=rhs.index) + + rhs = _combine_rhs(rhs) + lhs = DataFrame({'__y__' : lhs}, dtype=float) + pre_filt_rhs = rhs.dropna(how='any') + + combined = rhs.join(lhs, how='outer') + if weights is not None: + combined['__weights__'] = weights + + valid = (combined.count(1) == len(combined.columns)).values + index = combined.index + combined = combined[valid] + + if weights is not None: + filt_weights = combined.pop('__weights__') + else: + filt_weights = None + + filt_lhs = combined.pop('__y__') + filt_rhs = combined + + return (filt_lhs, filt_rhs, filt_weights, + pre_filt_rhs, index, valid) + + +def _combine_rhs(rhs): + """ + Glue input X variables together while checking for potential + duplicates + """ + series = {} + + if isinstance(rhs, Series): + series['x'] = rhs + elif isinstance(rhs, DataFrame): + series = rhs.copy() + elif isinstance(rhs, dict): + for name, value in rhs.iteritems(): + if isinstance(value, Series): + _safe_update(series, {name : value}) + elif isinstance(value, (dict, DataFrame)): + _safe_update(series, value) + else: # pragma: no cover + raise Exception('Invalid RHS data type: %s' % type(value)) + else: # pragma: no cover + raise Exception('Invalid RHS type: %s' % type(rhs)) + + if not isinstance(series, DataFrame): + series = DataFrame(series, dtype=float) + + return series + +# A little kludge so we can use this method for both +# MovingOLS and MovingPanelOLS +def _y_converter(y): + y = y.values.squeeze() + if y.ndim == 0: # pragma: no cover + return np.array([y]) + else: + return y + + +def f_stat_to_dict(result): + f_stat, shape, p_value = result + + result = {} + result['f-stat'] = f_stat + result['DF X'] = shape[0] + result['DF Resid'] = shape[1] + result['p-value'] = p_value + + return result + diff --git a/pandas/stats/plm.py b/pandas/stats/plm.py new file mode 100644 index 00000000..7b6f85b1 --- /dev/null +++ b/pandas/stats/plm.py @@ -0,0 +1,794 @@ +""" +Linear regression objects for panel data 
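+
+Provides PanelOLS, MovingPanelOLS and NonPooledPanelOLS, which layer entity /
+time fixed effects, categorical dummies and clustered standard errors on top
+of the plain OLS classes.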
+""" + +# pylint: disable-msg=W0231 +# pylint: disable-msg=E1101,E1103 + +from __future__ import division +import warnings + +import numpy as np + +from pandas.core.panel import Panel +from pandas.core.frame import DataFrame +from pandas.core.reshape import get_dummies +from pandas.core.series import Series +from pandas.core.sparse import SparsePanel +from pandas.stats.ols import OLS, MovingOLS +import pandas.stats.common as com +import pandas.stats.math as math +from pandas.util.decorators import cache_readonly + +class PanelOLS(OLS): + """Implements panel OLS. + + See ols function docs + """ + _panel_model = True + + def __init__(self, y, x, weights=None, intercept=True, nw_lags=None, + entity_effects=False, time_effects=False, x_effects=None, + cluster=None, dropped_dummies=None, verbose=False, + nw_overlap=False): + self._x_orig = x + self._y_orig = y + self._weights = weights + + self._intercept = intercept + self._nw_lags = nw_lags + self._nw_overlap = nw_overlap + self._entity_effects = entity_effects + self._time_effects = time_effects + self._x_effects = x_effects + self._dropped_dummies = dropped_dummies or {} + self._cluster = com._get_cluster_type(cluster) + self._verbose = verbose + + (self._x, self._x_trans, + self._x_filtered, self._y, + self._y_trans) = self._prepare_data() + + self._index = self._x.index.levels[0] + + self._T = len(self._index) + + def log(self, msg): + if self._verbose: # pragma: no cover + print msg + + def _prepare_data(self): + """Cleans and stacks input data into DataFrame objects + + If time effects is True, then we turn off intercepts and omit an item + from every (entity and x) fixed effect. + + Otherwise: + - If we have an intercept, we omit an item from every fixed effect. + - Else, we omit an item from every fixed effect except one of them. + + The categorical variables will get dropped from x. 
+ """ + (x, x_filtered, y, weights, cat_mapping) = self._filter_data() + + self.log('Adding dummies to X variables') + x = self._add_dummies(x, cat_mapping) + + self.log('Adding dummies to filtered X variables') + x_filtered = self._add_dummies(x_filtered, cat_mapping) + + if self._x_effects: + x = x.drop(self._x_effects, axis=1) + x_filtered = x_filtered.drop(self._x_effects, axis=1) + + if self._time_effects: + x_regressor = x.sub(x.mean(level=0), level=0) + + unstacked_y = y.unstack() + y_regressor = unstacked_y.sub(unstacked_y.mean(1), axis=0).stack() + y_regressor.index = y.index + + elif self._intercept: + # only add intercept when no time effects + self.log('Adding intercept') + x = x_regressor = add_intercept(x) + x_filtered = add_intercept(x_filtered) + y_regressor = y + else: + self.log('No intercept added') + x_regressor = x + y_regressor = y + + if weights is not None: + assert(y_regressor.index.equals(weights.index)) + assert(x_regressor.index.equals(weights.index)) + + rt_weights = np.sqrt(weights) + y_regressor = y_regressor * rt_weights + x_regressor = x_regressor.mul(rt_weights, axis=0) + + return x, x_regressor, x_filtered, y, y_regressor + + def _filter_data(self): + """ + + """ + data = self._x_orig + cat_mapping = {} + + if isinstance(data, DataFrame): + data = data.to_panel() + else: + if isinstance(data, Panel): + data = data.copy() + + if not isinstance(data, SparsePanel): + data, cat_mapping = self._convert_x(data) + + if not isinstance(data, Panel): + data = Panel.from_dict(data, intersect=True) + + x_names = data.items + + if self._weights is not None: + data['__weights__'] = self._weights + + # Filter x's without y (so we can make a prediction) + filtered = data.to_frame() + + # Filter all data together using to_frame + + # convert to DataFrame + y = self._y_orig + if isinstance(y, Series): + y = y.unstack() + + data['__y__'] = y + data_long = data.to_frame() + + x_filt = filtered.filter(x_names) + x = data_long.filter(x_names) + y = data_long['__y__'] + + if self._weights is not None and not self._weights.empty: + weights = data_long['__weights__'] + else: + weights = None + + return x, x_filt, y, weights, cat_mapping + + def _convert_x(self, x): + # Converts non-numeric data in x to floats. x_converted is the + # DataFrame with converted values, and x_conversion is a dict that + # provides the reverse mapping. For example, if 'A' was converted to 0 + # for x named 'variety', then x_conversion['variety'][0] is 'A'. 
+ x_converted = {} + cat_mapping = {} + # x can be either a dict or a Panel, but in Python 3, dicts don't have + # .iteritems + iteritems = getattr(x, 'iteritems', x.items) + for key, df in iteritems(): + assert(isinstance(df, DataFrame)) + + if _is_numeric(df): + x_converted[key] = df + else: + try: + df = df.astype(float) + except (TypeError, ValueError): + values = df.values + distinct_values = sorted(set(values.flat)) + cat_mapping[key] = dict(enumerate(distinct_values)) + new_values = np.searchsorted(distinct_values, values) + x_converted[key] = DataFrame(new_values, index=df.index, + columns=df.columns) + + if len(cat_mapping) == 0: + x_converted = x + + return x_converted, cat_mapping + + def _add_dummies(self, panel, mapping): + """ + Add entity and / or categorical dummies to input X DataFrame + + Returns + ------- + DataFrame + """ + panel = self._add_entity_effects(panel) + panel = self._add_categorical_dummies(panel, mapping) + + return panel + + def _add_entity_effects(self, panel): + """ + Add entity dummies to panel + + Returns + ------- + DataFrame + """ + from pandas.core.reshape import make_axis_dummies + + if not self._entity_effects: + return panel + + self.log('-- Adding entity fixed effect dummies') + + dummies = make_axis_dummies(panel, 'minor') + + if not self._use_all_dummies: + if 'entity' in self._dropped_dummies: + to_exclude = str(self._dropped_dummies.get('entity')) + else: + to_exclude = dummies.columns[0] + + if to_exclude not in dummies.columns: + raise Exception('%s not in %s' % (to_exclude, + dummies.columns)) + + self.log('-- Excluding dummy for entity: %s' % to_exclude) + + dummies = dummies.filter(dummies.columns - [to_exclude]) + + dummies = dummies.add_prefix('FE_') + panel = panel.join(dummies) + + return panel + + def _add_categorical_dummies(self, panel, cat_mappings): + """ + Add categorical dummies to panel + + Returns + ------- + DataFrame + """ + if not self._x_effects: + return panel + + dropped_dummy = (self._entity_effects and not self._use_all_dummies) + + for effect in self._x_effects: + self.log('-- Adding fixed effect dummies for %s' % effect) + + dummies = get_dummies(panel[effect]) + + val_map = cat_mappings.get(effect) + if val_map: + val_map = dict((v, k) for k, v in val_map.iteritems()) + + if dropped_dummy or not self._use_all_dummies: + if effect in self._dropped_dummies: + to_exclude = mapped_name = self._dropped_dummies.get(effect) + + if val_map: + mapped_name = val_map[to_exclude] + else: + to_exclude = mapped_name = dummies.columns[0] + + if mapped_name not in dummies.columns: # pragma: no cover + raise Exception('%s not in %s' % (to_exclude, + dummies.columns)) + + self.log('-- Excluding dummy for %s: %s' % (effect, to_exclude)) + + dummies = dummies.filter(dummies.columns - [mapped_name]) + dropped_dummy = True + + dummies = _convertDummies(dummies, cat_mappings.get(effect)) + dummies = dummies.add_prefix('%s_' % effect) + panel = panel.join(dummies) + + return panel + + @property + def _use_all_dummies(self): + """ + In the case of using an intercept or including time fixed + effects, completely partitioning the sample would make the X + not full rank. 
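+        All dummies are kept only when there is neither an intercept nor time
+        effects; otherwise one category per effect is dropped.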
+ """ + return (not self._intercept and not self._time_effects) + + @cache_readonly + def _beta_raw(self): + """Runs the regression and returns the beta.""" + X = self._x_trans.values + Y = self._y_trans.values.squeeze() + + beta, _, _, _ = np.linalg.lstsq(X, Y) + + return beta + + @cache_readonly + def beta(self): + return Series(self._beta_raw, index=self._x.columns) + + @cache_readonly + def _df_model_raw(self): + """Returns the raw model degrees of freedom.""" + return self._df_raw - 1 + + @cache_readonly + def _df_resid_raw(self): + """Returns the raw residual degrees of freedom.""" + return self._nobs - self._df_raw + + @cache_readonly + def _df_raw(self): + """Returns the degrees of freedom.""" + df = math.rank(self._x_trans.values) + if self._time_effects: + df += self._total_times + + return df + + @cache_readonly + def _r2_raw(self): + Y = self._y_trans.values.squeeze() + X = self._x_trans.values + + resid = Y - np.dot(X, self._beta_raw) + + SSE = (resid ** 2).sum() + + if self._use_centered_tss: + SST = ((Y - np.mean(Y)) ** 2).sum() + else: + SST = (Y**2).sum() + + return 1 - SSE / SST + + @property + def _use_centered_tss(self): + # has_intercept = np.abs(self._resid_raw.sum()) < _FP_ERR + return self._intercept or self._entity_effects or self._time_effects + + @cache_readonly + def _r2_adj_raw(self): + """Returns the raw r-squared adjusted values.""" + nobs = self._nobs + factors = (nobs - 1) / (nobs - self._df_raw) + return 1 - (1 - self._r2_raw) * factors + + @cache_readonly + def _resid_raw(self): + Y = self._y.values.squeeze() + X = self._x.values + return Y - np.dot(X, self._beta_raw) + + @cache_readonly + def resid(self): + return self._unstack_vector(self._resid_raw) + + @cache_readonly + def _rmse_raw(self): + """Returns the raw rmse values.""" + # X = self._x.values + # Y = self._y.values.squeeze() + + X = self._x_trans.values + Y = self._y_trans.values.squeeze() + + resid = Y - np.dot(X, self._beta_raw) + ss = (resid ** 2).sum() + return np.sqrt(ss / (self._nobs - self._df_raw)) + + @cache_readonly + def _var_beta_raw(self): + cluster_axis = None + if self._cluster == 'time': + cluster_axis = 0 + elif self._cluster == 'entity': + cluster_axis = 1 + + x = self._x + y = self._y + + if self._time_effects: + xx = _xx_time_effects(x, y) + else: + xx = np.dot(x.values.T, x.values) + + return _var_beta_panel(y, x, self._beta_raw, xx, + self._rmse_raw, cluster_axis, self._nw_lags, + self._nobs, self._df_raw, self._nw_overlap) + + @cache_readonly + def _y_fitted_raw(self): + """Returns the raw fitted y values.""" + return np.dot(self._x.values, self._beta_raw) + + @cache_readonly + def y_fitted(self): + return self._unstack_vector(self._y_fitted_raw, index=self._x.index) + + def _unstack_vector(self, vec, index=None): + if index is None: + index = self._y_trans.index + panel = DataFrame(vec, index=index, columns=['dummy']) + return panel.to_panel()['dummy'] + + def _unstack_y(self, vec): + unstacked = self._unstack_vector(vec) + return unstacked.reindex(self.beta.index) + + @cache_readonly + def _time_obs_count(self): + return self._y_trans.count(level=0).values + + @cache_readonly + def _time_has_obs(self): + return self._time_obs_count > 0 + + @property + def _nobs(self): + return len(self._y) + +def _convertDummies(dummies, mapping): + # cleans up the names of the generated dummies + new_items = [] + for item in dummies.columns: + if not mapping: + var = str(item) + if isinstance(item, float): + var = '%g' % item + + new_items.append(var) + else: + # renames the dummies if 
a conversion dict is provided
+            new_items.append(mapping[int(item)])
+
+    dummies = DataFrame(dummies.values, index=dummies.index,
+                        columns=new_items)
+
+    return dummies
+
+def _is_numeric(df):
+    for col in df:
+        if df[col].dtype.name == 'object':
+            return False
+
+    return True
+
+def add_intercept(panel, name='intercept'):
+    """
+    Add column of ones to input panel
+
+    Parameters
+    ----------
+    panel: Panel / DataFrame
+    name: string, default 'intercept'
+
+    Returns
+    -------
+    New object (same type as input)
+    """
+    panel = panel.copy()
+    panel[name] = 1.
+
+    return panel.consolidate()
+
+class MovingPanelOLS(MovingOLS, PanelOLS):
+    """Implements rolling/expanding panel OLS.
+
+    See ols function docs
+    """
+    _panel_model = True
+
+    def __init__(self, y, x, weights=None,
+                 window_type='expanding', window=None,
+                 min_periods=None,
+                 min_obs=None,
+                 intercept=True,
+                 nw_lags=None, nw_overlap=False,
+                 entity_effects=False,
+                 time_effects=False,
+                 x_effects=None,
+                 cluster=None,
+                 dropped_dummies=None,
+                 verbose=False):
+
+        self._args = dict(intercept=intercept,
+                          nw_lags=nw_lags,
+                          nw_overlap=nw_overlap,
+                          entity_effects=entity_effects,
+                          time_effects=time_effects,
+                          x_effects=x_effects,
+                          cluster=cluster,
+                          dropped_dummies=dropped_dummies,
+                          verbose=verbose)
+
+        PanelOLS.__init__(self, y=y, x=x, weights=weights,
+                          **self._args)
+
+        self._set_window(window_type, window, min_periods)
+
+        if min_obs is None:
+            min_obs = len(self._x.columns) + 1
+
+        self._min_obs = min_obs
+
+    @cache_readonly
+    def resid(self):
+        return self._unstack_y(self._resid_raw)
+
+    @cache_readonly
+    def y_fitted(self):
+        return self._unstack_y(self._y_fitted_raw)
+
+    @cache_readonly
+    def y_predict(self):
+        """Returns the predicted y values."""
+        return self._unstack_y(self._y_predict_raw)
+
+    def lagged_y_predict(self, lag=1):
+        """
+        Compute forecast y values, lagging the coefficients by the given
+        number of time periods
+
+        Parameters
+        ----------
+        lag : int
+
+        Returns
+        -------
+        DataFrame
+        """
+        x = self._x.values
+        betas = self._beta_matrix(lag=lag)
+        return self._unstack_y((betas * x).sum(1))
+
+    @cache_readonly
+    def _rolling_ols_call(self):
+        return self._calc_betas(self._x_trans, self._y_trans)
+
+    @cache_readonly
+    def _df_raw(self):
+        """Returns the degrees of freedom."""
+        df = self._rolling_rank()
+
+        if self._time_effects:
+            df += self._window_time_obs
+
+        return df[self._valid_indices]
+
+    @cache_readonly
+    def _var_beta_raw(self):
+        """Returns the raw covariance of beta."""
+        x = self._x
+        y = self._y
+
+        dates = x.index.levels[0]
+
+        cluster_axis = None
+        if self._cluster == 'time':
+            cluster_axis = 0
+        elif self._cluster == 'entity':
+            cluster_axis = 1
+
+        nobs = self._nobs
+        rmse = self._rmse_raw
+        beta = self._beta_raw
+        df = self._df_raw
+        window = self._window
+
+        if not self._time_effects:
+            # Non-transformed X
+            cum_xx = self._cum_xx(x)
+
+        results = []
+        for n, i in enumerate(self._valid_indices):
+            if self._is_rolling and i >= window:
+                prior_date = dates[i - window + 1]
+            else:
+                prior_date = dates[0]
+
+            date = dates[i]
+
+            x_slice = x.truncate(prior_date, date)
+            y_slice = y.truncate(prior_date, date)
+
+            if self._time_effects:
+                xx = _xx_time_effects(x_slice, y_slice)
+            else:
+                xx = cum_xx[i]
+                if self._is_rolling and i >= window:
+                    xx = xx - cum_xx[i - window]
+
+            result = _var_beta_panel(y_slice, x_slice, beta[n], xx, rmse[n],
+                                     cluster_axis, self._nw_lags,
+                                     nobs[n], df[n], self._nw_overlap)
+
+            results.append(result)
+
+        return np.array(results)
+
+    @cache_readonly
+    def _resid_raw(self):
beta_matrix = self._beta_matrix(lag=0) + + Y = self._y.values.squeeze() + X = self._x.values + resid = Y - (X * beta_matrix).sum(1) + + return resid + + @cache_readonly + def _y_fitted_raw(self): + x = self._x.values + betas = self._beta_matrix(lag=0) + return (betas * x).sum(1) + + @cache_readonly + def _y_predict_raw(self): + """Returns the raw predicted y values.""" + x = self._x.values + betas = self._beta_matrix(lag=1) + return (betas * x).sum(1) + + def _beta_matrix(self, lag=0): + assert(lag >= 0) + + index = self._y_trans.index + major_labels = index.labels[0] + labels = major_labels - lag + indexer = self._valid_indices.searchsorted(labels, side='left') + + beta_matrix = self._beta_raw[indexer] + beta_matrix[labels < self._valid_indices[0]] = np.NaN + + return beta_matrix + + @cache_readonly + def _enough_obs(self): + # XXX: what's the best way to determine where to start? + # TODO: write unit tests for this + + rank_threshold = len(self._x.columns) + 1 + if self._min_obs < rank_threshold: # pragma: no cover + warnings.warn('min_obs is smaller than rank of X matrix') + + enough_observations = self._nobs_raw >= self._min_obs + enough_time_periods = self._window_time_obs >= self._min_periods + return enough_time_periods & enough_observations + +def create_ols_dict(attr): + def attr_getter(self): + d = {} + for k, v in self.results.iteritems(): + result = getattr(v, attr) + d[k] = result + + return d + + return attr_getter + +def create_ols_attr(attr): + return property(create_ols_dict(attr)) + +class NonPooledPanelOLS(object): + """Implements non-pooled panel OLS. + + Parameters + ---------- + y : DataFrame + x : Series, DataFrame, or dict of Series + intercept : bool + True if you want an intercept. + nw_lags : None or int + Number of Newey-West lags. 
+ window_type : {'full_sample', 'rolling', 'expanding'} + 'full_sample' by default + window : int + size of window (for rolling/expanding OLS) + """ + + ATTRIBUTES = [ + 'beta', + 'df', + 'df_model', + 'df_resid', + 'f_stat', + 'p_value', + 'r2', + 'r2_adj', + 'resid', + 'rmse', + 'std_err', + 'summary_as_matrix', + 't_stat', + 'var_beta', + 'x', + 'y', + 'y_fitted', + 'y_predict' + ] + + def __init__(self, y, x, window_type='full_sample', window=None, + min_periods=None, intercept=True, nw_lags=None, + nw_overlap=False): + + for attr in self.ATTRIBUTES: + setattr(self.__class__, attr, create_ols_attr(attr)) + + results = {} + + for entity in y: + entity_y = y[entity] + + entity_x = {} + for x_var in x: + entity_x[x_var] = x[x_var][entity] + + from pandas.stats.interface import ols + results[entity] = ols(y=entity_y, + x=entity_x, + window_type=window_type, + window=window, + min_periods=min_periods, + intercept=intercept, + nw_lags=nw_lags, + nw_overlap=nw_overlap) + + self.results = results + + +def _var_beta_panel(y, x, beta, xx, rmse, cluster_axis, + nw_lags, nobs, df, nw_overlap): + from pandas.core.frame import group_agg + xx_inv = math.inv(xx) + + yv = y.values + + if cluster_axis is None: + if nw_lags is None: + return xx_inv * (rmse ** 2) + else: + resid = yv - np.dot(x.values, beta) + m = (x.values.T * resid).T + + xeps = math.newey_west(m, nw_lags, nobs, df, nw_overlap) + + return np.dot(xx_inv, np.dot(xeps, xx_inv)) + else: + Xb = np.dot(x.values, beta).reshape((len(x.values), 1)) + resid = DataFrame(yv[:, None] - Xb, index=y.index, columns=['resid']) + + if cluster_axis == 1: + x = x.swaplevel(0, 1).sortlevel(0) + resid = resid.swaplevel(0, 1).sortlevel(0) + + m = group_agg(x.values * resid.values, x.index._bounds, + lambda x: np.sum(x, axis=0)) + + if nw_lags is None: + nw_lags = 0 + + xox = 0 + for i in range(len(x.index.levels[0])): + xox += math.newey_west(m[i : i + 1], nw_lags, + nobs, df, nw_overlap) + + return np.dot(xx_inv, np.dot(xox, xx_inv)) + +def _xx_time_effects(x, y): + """ + Returns X'X - (X'T) (T'T)^-1 (T'X) + """ + # X'X + xx = np.dot(x.values.T, x.values) + xt = x.sum(level=0).values + + count = y.unstack().count(1).values + selector = count > 0 + + # X'X - (T'T)^-1 (T'X) + xt = xt[selector] + count = count[selector] + + return xx - np.dot(xt.T / count, xt) + + diff --git a/pandas/stats/tests/__init__.py b/pandas/stats/tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/pandas/stats/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/pandas/stats/tests/common.py b/pandas/stats/tests/common.py new file mode 100644 index 00000000..b2060d30 --- /dev/null +++ b/pandas/stats/tests/common.py @@ -0,0 +1,156 @@ +# pylint: disable-msg=W0611,W0402 + +from datetime import datetime +import string +import unittest +import nose + +import numpy as np + +from pandas import DataFrame, bdate_range +from pandas.util.testing import assert_almost_equal # imported in other tests +N = 100 +K = 4 + +start = datetime(2007, 1, 1) +DATE_RANGE = bdate_range(start, periods=N) + +COLS = ['Col' + c for c in string.ascii_uppercase[:K]] + +def makeDataFrame(): + data = DataFrame(np.random.randn(N, K), + columns=COLS, + index=DATE_RANGE) + + return data + +def getBasicDatasets(): + A = makeDataFrame() + B = makeDataFrame() + C = makeDataFrame() + + return A, B, C + +def check_for_scipy(): + try: + import scipy + except ImportError: + raise nose.SkipTest('no scipy') + +def check_for_statsmodels(): + _have_statsmodels = True + try: + import statsmodels.api as sm + 
except ImportError: + try: + import scikits.statsmodels.api as sm + except ImportError: + raise nose.SkipTest('no statsmodels') + + +class BaseTest(unittest.TestCase): + def setUp(self): + check_for_scipy() + check_for_statsmodels() + + + self.A, self.B, self.C = getBasicDatasets() + + self.createData1() + self.createData2() + self.createData3() + + def createData1(self): + date = datetime(2007, 1, 1) + date2 = datetime(2007, 1, 15) + date3 = datetime(2007, 1, 22) + + A = self.A.copy() + B = self.B.copy() + C = self.C.copy() + + A['ColA'][date] = np.NaN + B['ColA'][date] = np.NaN + C['ColA'][date] = np.NaN + C['ColA'][date2] = np.NaN + + # truncate data to save time + A = A[:30] + B = B[:30] + C = C[:30] + + self.panel_y = A + self.panel_x = {'B' : B, 'C' : C} + + self.series_panel_y = A.filter(['ColA']) + self.series_panel_x = {'B' : B.filter(['ColA']), + 'C' : C.filter(['ColA'])} + self.series_y = A['ColA'] + self.series_x = {'B' : B['ColA'], + 'C' : C['ColA']} + + def createData2(self): + y_data = [[1, np.NaN], + [2, 3], + [4, 5]] + y_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3)] + y_cols = ['A', 'B'] + self.panel_y2 = DataFrame(np.array(y_data), index=y_index, + columns=y_cols) + + x1_data = [[6, np.NaN], + [7, 8], + [9, 30], + [11, 12]] + x1_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4)] + x1_cols = ['A', 'B'] + x1 = DataFrame(np.array(x1_data), index=x1_index, + columns=x1_cols) + + x2_data = [[13, 14, np.NaN], + [15, np.NaN, np.NaN], + [16, 17, 48], + [19, 20, 21], + [22, 23, 24]] + x2_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2), + datetime(2000, 1, 3), + datetime(2000, 1, 4), + datetime(2000, 1, 5)] + x2_cols = ['C', 'A', 'B'] + x2 = DataFrame(np.array(x2_data), index=x2_index, + columns=x2_cols) + + self.panel_x2 = {'x1' : x1, 'x2' : x2} + + def createData3(self): + y_data = [[1, 2], + [3, 4]] + y_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + y_cols = ['A', 'B'] + self.panel_y3 = DataFrame(np.array(y_data), index=y_index, + columns=y_cols) + + x1_data = [['A', 'B'], + ['C', 'A']] + x1_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + x1_cols = ['A', 'B'] + x1 = DataFrame(np.array(x1_data), index=x1_index, + columns=x1_cols) + + x2_data = [['foo', 'bar'], + ['baz', 'foo']] + x2_index = [datetime(2000, 1, 1), + datetime(2000, 1, 2)] + x2_cols = ['A', 'B'] + x2 = DataFrame(np.array(x2_data), index=x2_index, + columns=x2_cols) + + self.panel_x3 = {'x1' : x1, 'x2' : x2} diff --git a/pandas/stats/tests/test_fama_macbeth.py b/pandas/stats/tests/test_fama_macbeth.py new file mode 100644 index 00000000..f48dde20 --- /dev/null +++ b/pandas/stats/tests/test_fama_macbeth.py @@ -0,0 +1,61 @@ +from pandas import DataFrame, Panel +from pandas.stats.api import fama_macbeth +from common import assert_almost_equal, BaseTest + +import numpy as np + +class TestFamaMacBeth(BaseTest): + def testFamaMacBethRolling(self): + # self.checkFamaMacBethExtended('rolling', self.panel_x, self.panel_y, + # nw_lags_beta=2) + + # df = DataFrame(np.random.randn(50, 10)) + x = dict((k, DataFrame(np.random.randn(50, 10))) for k in 'abcdefg') + x = Panel.from_dict(x) + y = (DataFrame(np.random.randn(50, 10)) + + DataFrame(0.01 * np.random.randn(50, 10))) + self.checkFamaMacBethExtended('rolling', x, y, nw_lags_beta=2) + self.checkFamaMacBethExtended('expanding', x, y, nw_lags_beta=2) + + def checkFamaMacBethExtended(self, window_type, x, y, **kwds): + window = 25 + + result = fama_macbeth(y=y, x=x, 
window_type=window_type, window=window, + **kwds) + self._check_stuff_works(result) + + index = result._index + time = len(index) + + for i in xrange(time - window + 1): + if window_type == 'rolling': + start = index[i] + else: + start = index[0] + + end = index[i + window - 1] + + x2 = {} + for k, v in x.iteritems(): + x2[k] = v.truncate(start, end) + y2 = y.truncate(start, end) + + reference = fama_macbeth(y=y2, x=x2, **kwds) + assert_almost_equal(reference._stats, result._stats[:, i]) + + static = fama_macbeth(y=y2, x=x2, **kwds) + self._check_stuff_works(static) + + def _check_stuff_works(self, result): + # does it work? + attrs = ['mean_beta', 'std_beta', 't_stat'] + for attr in attrs: + getattr(result, attr) + + # does it work? + result.summary + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_math.py b/pandas/stats/tests/test_math.py new file mode 100644 index 00000000..6553023f --- /dev/null +++ b/pandas/stats/tests/test_math.py @@ -0,0 +1,67 @@ +import unittest +import nose + +from datetime import datetime +from numpy.random import randn +import numpy as np + +from pandas.core.api import Series, DataFrame, date_range +from pandas.util.testing import assert_almost_equal +import pandas.core.datetools as datetools +import pandas.stats.moments as mom +import pandas.util.testing as tm +import pandas.stats.math as pmath +import pandas.tests.test_series as ts +from pandas import ols + +N, K = 100, 10 + +_have_statsmodels = True +try: + import statsmodels.api as sm +except ImportError: + try: + import scikits.statsmodels.api as sm + except ImportError: + _have_statsmodels = False + +class TestMath(unittest.TestCase): + + _nan_locs = np.arange(20, 40) + _inf_locs = np.array([]) + + def setUp(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = date_range(datetime(2009, 1, 1), periods=N) + + self.series = Series(arr.copy(), index=self.rng) + + self.frame = DataFrame(randn(N, K), index=self.rng, + columns=np.arange(K)) + + def test_rank_1d(self): + self.assertEqual(1, pmath.rank(self.series)) + self.assertEqual(0, pmath.rank(Series(0, self.series.index))) + + def test_solve_rect(self): + if not _have_statsmodels: + raise nose.SkipTest + + b = Series(np.random.randn(N), self.frame.index) + result = pmath.solve(self.frame, b) + expected = ols(y=b, x=self.frame, intercept=False).beta + self.assert_(np.allclose(result, expected)) + + def test_inv_illformed(self): + singular = DataFrame(np.array([[1, 1], [2, 2]])) + rs = pmath.inv(singular) + expected = np.array([[0.1, 0.2], [0.1, 0.2]]) + self.assert_(np.allclose(rs, expected)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py new file mode 100644 index 00000000..ff13385e --- /dev/null +++ b/pandas/stats/tests/test_moments.py @@ -0,0 +1,340 @@ +import unittest +import nose +import sys +import functools + +from datetime import datetime +from numpy.random import randn +import numpy as np + +from pandas import Series, DataFrame, bdate_range +from pandas.util.testing import assert_almost_equal, assert_series_equal +import pandas.core.datetools as datetools +import pandas.stats.moments as mom +import pandas.util.testing as tm + +N, K = 100, 10 + +class TestMoments(unittest.TestCase): + + _nan_locs = np.arange(20, 40) + _inf_locs = 
np.array([]) + + def setUp(self): + arr = randn(N) + arr[self._nan_locs] = np.NaN + + self.arr = arr + self.rng = bdate_range(datetime(2009, 1, 1), periods=N) + + self.series = Series(arr.copy(), index=self.rng) + + self.frame = DataFrame(randn(N, K), index=self.rng, + columns=np.arange(K)) + + def test_rolling_sum(self): + self._check_moment_func(mom.rolling_sum, np.sum) + + def test_rolling_count(self): + counter = lambda x: np.isfinite(x).astype(float).sum() + self._check_moment_func(mom.rolling_count, counter, + has_min_periods=False, + preserve_nan=False) + + def test_rolling_mean(self): + self._check_moment_func(mom.rolling_mean, np.mean) + + def test_rolling_median(self): + self._check_moment_func(mom.rolling_median, np.median) + + def test_rolling_min(self): + self._check_moment_func(mom.rolling_min, np.min) + + def test_rolling_max(self): + self._check_moment_func(mom.rolling_max, np.max) + + def test_rolling_quantile(self): + qs = [.1, .5, .9] + + def scoreatpercentile(a, per): + values = np.sort(a,axis=0) + + idx = per /1. * (values.shape[0] - 1) + return values[int(idx)] + + for q in qs: + def f(x, window, min_periods=None, freq=None): + return mom.rolling_quantile(x, window, q, + min_periods=min_periods, + freq=freq) + def alt(x): + return scoreatpercentile(x, q) + + self._check_moment_func(f, alt) + + def test_rolling_apply(self): + ser = Series([]) + assert_series_equal(ser, mom.rolling_apply(ser, 10, lambda x:x.mean())) + + def roll_mean(x, window, min_periods=None, freq=None): + return mom.rolling_apply(x, window, + lambda x: x[np.isfinite(x)].mean(), + min_periods=min_periods, + freq=freq) + self._check_moment_func(roll_mean, np.mean) + + def test_rolling_std(self): + self._check_moment_func(mom.rolling_std, + lambda x: np.std(x, ddof=1)) + self._check_moment_func(functools.partial(mom.rolling_std, ddof=0), + lambda x: np.std(x, ddof=0)) + + def test_rolling_var(self): + self._check_moment_func(mom.rolling_var, + lambda x: np.var(x, ddof=1)) + self._check_moment_func(functools.partial(mom.rolling_var, ddof=0), + lambda x: np.var(x, ddof=0)) + + def test_rolling_skew(self): + try: + from scipy.stats import skew + except ImportError: + raise nose.SkipTest('no scipy') + self._check_moment_func(mom.rolling_skew, + lambda x: skew(x, bias=False)) + + def test_rolling_kurt(self): + try: + from scipy.stats import kurtosis + except ImportError: + raise nose.SkipTest('no scipy') + self._check_moment_func(mom.rolling_kurt, + lambda x: kurtosis(x, bias=False)) + + def _check_moment_func(self, func, static_comp, window=50, + has_min_periods=True, + has_time_rule=True, + preserve_nan=True): + + self._check_ndarray(func, static_comp, window=window, + has_min_periods=has_min_periods, + preserve_nan=preserve_nan) + + self._check_structures(func, static_comp, + has_min_periods=has_min_periods, + has_time_rule=has_time_rule) + + def _check_ndarray(self, func, static_comp, window=50, + has_min_periods=True, + preserve_nan=True): + + result = func(self.arr, window) + assert_almost_equal(result[-1], + static_comp(self.arr[-50:])) + + if preserve_nan: + assert(np.isnan(result[self._nan_locs]).all()) + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + + if has_min_periods: + result = func(arr, 50, min_periods=30) + assert_almost_equal(result[-1], static_comp(arr[10:-10])) + + # min_periods is working correctly + result = func(arr, 20, min_periods=15) + self.assert_(np.isnan(result[23])) + self.assert_(not np.isnan(result[24])) + + self.assert_(not 
np.isnan(result[-6])) + self.assert_(np.isnan(result[-5])) + + # min_periods=0 + result0 = func(arr, 20, min_periods=0) + result1 = func(arr, 20, min_periods=1) + assert_almost_equal(result0, result1) + else: + result = func(arr, 50) + assert_almost_equal(result[-1], static_comp(arr[10:-10])) + + def _check_structures(self, func, static_comp, + has_min_periods=True, has_time_rule=True): + + series_result = func(self.series, 50) + self.assert_(isinstance(series_result, Series)) + + frame_result = func(self.frame, 50) + self.assertEquals(type(frame_result), DataFrame) + + # check time_rule works + if has_time_rule: + win = 25 + minp = 10 + + if has_min_periods: + series_result = func(self.series[::2], win, min_periods=minp, + freq='B') + frame_result = func(self.frame[::2], win, min_periods=minp, + freq='B') + else: + series_result = func(self.series[::2], win, freq='B') + frame_result = func(self.frame[::2], win, freq='B') + + last_date = series_result.index[-1] + prev_date = last_date - 24 * datetools.bday + + trunc_series = self.series[::2].truncate(prev_date, last_date) + trunc_frame = self.frame[::2].truncate(prev_date, last_date) + + assert_almost_equal(series_result[-1], static_comp(trunc_series)) + + assert_almost_equal(frame_result.xs(last_date), + trunc_frame.apply(static_comp)) + + def test_legacy_time_rule_arg(self): + from StringIO import StringIO + # suppress deprecation warnings + sys.stderr = StringIO() + + rng = bdate_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + ts = ts.take(np.random.permutation(len(ts))[:12]).sort_index() + + try: + result = mom.rolling_mean(ts, 1, min_periods=1, freq='B') + expected = mom.rolling_mean(ts, 1, min_periods=1, + time_rule='WEEKDAY') + tm.assert_series_equal(result, expected) + + result = mom.ewma(ts, span=5, freq='B') + expected = mom.ewma(ts, span=5, time_rule='WEEKDAY') + tm.assert_series_equal(result, expected) + + finally: + sys.stderr = sys.__stderr__ + + def test_ewma(self): + self._check_ew(mom.ewma) + + def test_ewmvar(self): + self._check_ew(mom.ewmvar) + + def test_ewmvol(self): + self._check_ew(mom.ewmvol) + + def test_ewma_span_com_args(self): + A = mom.ewma(self.arr, com=9.5) + B = mom.ewma(self.arr, span=20) + assert_almost_equal(A, B) + + self.assertRaises(Exception, mom.ewma, self.arr, com=9.5, span=20) + self.assertRaises(Exception, mom.ewma, self.arr) + + def _check_ew(self, func): + self._check_ew_ndarray(func) + self._check_ew_structures(func) + + def _check_ew_ndarray(self, func, preserve_nan=False): + result = func(self.arr, com=10) + if preserve_nan: + assert(np.isnan(result[self._nan_locs]).all()) + + # excluding NaNs correctly + arr = randn(50) + arr[:10] = np.NaN + arr[-10:] = np.NaN + + # ??? 
check something + + # pass in ints + result2 = func(np.arange(50), span=10) + self.assert_(result2.dtype == np.float_) + + def _check_ew_structures(self, func): + series_result = func(self.series, com=10) + self.assert_(isinstance(series_result, Series)) + frame_result = func(self.frame, com=10) + self.assertEquals(type(frame_result), DataFrame) + + # binary moments + def test_rolling_cov(self): + A = self.series + B = A + randn(len(A)) + + result = mom.rolling_cov(A, B, 50, min_periods=25) + assert_almost_equal(result[-1], np.cov(A[-50:], B[-50:])[0, 1]) + + def test_rolling_corr(self): + A = self.series + B = A + randn(len(A)) + + result = mom.rolling_corr(A, B, 50, min_periods=25) + assert_almost_equal(result[-1], np.corrcoef(A[-50:], B[-50:])[0, 1]) + + # test for correct bias correction + a = tm.makeTimeSeries() + b = tm.makeTimeSeries() + a[:5] = np.nan + b[:10] = np.nan + + result = mom.rolling_corr(a, b, len(a), min_periods=1) + assert_almost_equal(result[-1], a.corr(b)) + + def test_rolling_corr_pairwise(self): + panel = mom.rolling_corr_pairwise(self.frame, 10, min_periods=5) + + correl = panel.ix[:, 1, 5] + exp = mom.rolling_corr(self.frame[1], self.frame[5], + 10, min_periods=5) + tm.assert_series_equal(correl, exp) + + def test_flex_binary_frame(self): + def _check(method): + series = self.frame[1] + + res = method(series, self.frame, 10) + res2 = method(self.frame, series, 10) + exp = self.frame.apply(lambda x: method(series, x, 10)) + + tm.assert_frame_equal(res, exp) + tm.assert_frame_equal(res2, exp) + + frame2 = self.frame.copy() + frame2.values[:] = np.random.randn(*frame2.shape) + + res3 = method(self.frame, frame2, 10) + exp = DataFrame(dict((k, method(self.frame[k], frame2[k], 10)) + for k in self.frame)) + tm.assert_frame_equal(res3, exp) + + methods = [mom.rolling_corr, mom.rolling_cov] + for meth in methods: + _check(meth) + + def test_ewmcov(self): + self._check_binary_ew(mom.ewmcov) + + def test_ewmcorr(self): + self._check_binary_ew(mom.ewmcorr) + + def _check_binary_ew(self, func): + A = Series(randn(50), index=np.arange(50)) + B = A[2:] + randn(48) + + A[:10] = np.NaN + B[-10:] = np.NaN + + result = func(A, B, 20, min_periods=5) + + self.assert_(np.isnan(result.values[:15]).all()) + self.assert_(not np.isnan(result.values[15:]).any()) + + self.assertRaises(Exception, func, A, randn(50), 20, min_periods=5) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_ols.py b/pandas/stats/tests/test_ols.py new file mode 100644 index 00000000..8877c4e6 --- /dev/null +++ b/pandas/stats/tests/test_ols.py @@ -0,0 +1,833 @@ +""" +Unit test suite for OLS and PanelOLS classes +""" + +# pylint: disable-msg=W0212 + +from __future__ import division + +from datetime import datetime +import unittest +import nose +import numpy as np + +from pandas import date_range, bdate_range +from pandas.core.panel import Panel +from pandas import DataFrame, Index, Series, notnull, datetools +from pandas.stats.api import ols +from pandas.stats.ols import _filter_data +from pandas.stats.plm import NonPooledPanelOLS, PanelOLS +from pandas.util.testing import (assert_almost_equal, assert_series_equal, + assert_frame_equal) +import pandas.util.testing as tm + +from common import BaseTest + +_have_statsmodels = True +try: + import statsmodels.api as sm +except ImportError: + try: + import scikits.statsmodels.api as sm + except ImportError: + _have_statsmodels = False + +def 
_check_repr(obj): + repr(obj) + str(obj) + +def _compare_ols_results(model1, model2): + assert(type(model1) == type(model2)) + + if hasattr(model1, '_window_type'): + _compare_moving_ols(model1, model2) + else: + _compare_fullsample_ols(model1, model2) + +def _compare_fullsample_ols(model1, model2): + assert_series_equal(model1.beta, model2.beta) + +def _compare_moving_ols(model1, model2): + assert_frame_equal(model1.beta, model2.beta) + +class TestOLS(BaseTest): + + # TODO: Add tests for OLS y predict + # TODO: Right now we just check for consistency between full-sample and + # rolling/expanding results of the panel OLS. We should also cross-check + # with trusted implementations of panel OLS (e.g. R). + # TODO: Add tests for non pooled OLS. + + @classmethod + def setUpClass(cls): + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + pass + + if not _have_statsmodels: + raise nose.SkipTest + + def testOLSWithDatasets(self): + self.checkDataSet(sm.datasets.ccard.load(), skip_moving=True) + self.checkDataSet(sm.datasets.cpunish.load(), skip_moving=True) + self.checkDataSet(sm.datasets.longley.load(), skip_moving=True) + self.checkDataSet(sm.datasets.stackloss.load(), skip_moving=True) + self.checkDataSet(sm.datasets.copper.load()) + self.checkDataSet(sm.datasets.scotland.load()) + + # degenerate case fails on some platforms + # self.checkDataSet(datasets.ccard.load(), 39, 49) # one col in X all 0s + + def testWLS(self): + X = DataFrame(np.random.randn(30, 4), columns=['A', 'B', 'C', 'D']) + Y = Series(np.random.randn(30)) + weights = X.std(1) + + self._check_wls(X, Y, weights) + + weights.ix[[5, 15]] = np.nan + Y[[2, 21]] = np.nan + self._check_wls(X, Y, weights) + + def _check_wls(self, x, y, weights): + result = ols(y=y, x=x, weights=1/weights) + + combined = x.copy() + combined['__y__'] = y + combined['__weights__'] = weights + combined = combined.dropna() + + endog = combined.pop('__y__').values + aweights = combined.pop('__weights__').values + exog = sm.add_constant(combined.values, prepend=False) + + sm_result = sm.WLS(endog, exog, weights=1/aweights).fit() + + assert_almost_equal(sm_result.params, result._beta_raw) + assert_almost_equal(sm_result.resid, result._resid_raw) + + self.checkMovingOLS('rolling', x, y, weights=weights) + self.checkMovingOLS('expanding', x, y, weights=weights) + + def checkDataSet(self, dataset, start=None, end=None, skip_moving=False): + exog = dataset.exog[start : end] + endog = dataset.endog[start : end] + x = DataFrame(exog, index=np.arange(exog.shape[0]), + columns=np.arange(exog.shape[1])) + y = Series(endog, index=np.arange(len(endog))) + + self.checkOLS(exog, endog, x, y) + + if not skip_moving: + self.checkMovingOLS('rolling', x, y) + self.checkMovingOLS('rolling', x, y, nw_lags=0) + self.checkMovingOLS('expanding', x, y, nw_lags=0) + self.checkMovingOLS('rolling', x, y, nw_lags=1) + self.checkMovingOLS('expanding', x, y, nw_lags=1) + self.checkMovingOLS('expanding', x, y, nw_lags=1, nw_overlap=True) + + def checkOLS(self, exog, endog, x, y): + reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit() + result = ols(y=y, x=x) + + # check that sparse version is the same + sparse_result = ols(y=y.to_sparse(), x=x.to_sparse()) + _compare_ols_results(result, sparse_result) + + assert_almost_equal(reference.params, result._beta_raw) + assert_almost_equal(reference.df_model, result._df_model_raw) + assert_almost_equal(reference.df_resid, result._df_resid_raw) + assert_almost_equal(reference.fvalue, 
result._f_stat_raw[0]) + assert_almost_equal(reference.pvalues, result._p_value_raw) + assert_almost_equal(reference.rsquared, result._r2_raw) + assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw) + assert_almost_equal(reference.resid, result._resid_raw) + assert_almost_equal(reference.bse, result._std_err_raw) + assert_almost_equal(reference.tvalues, result._t_stat_raw) + assert_almost_equal(reference.cov_params(), result._var_beta_raw) + assert_almost_equal(reference.fittedvalues, result._y_fitted_raw) + + _check_non_raw_results(result) + + def checkMovingOLS(self, window_type, x, y, weights=None, **kwds): + window = sm.tools.tools.rank(x.values) * 2 + + moving = ols(y=y, x=x, weights=weights, window_type=window_type, + window=window, **kwds) + + # check that sparse version is the same + sparse_moving = ols(y=y.to_sparse(), x=x.to_sparse(), + weights=weights, + window_type=window_type, + window=window, **kwds) + _compare_ols_results(moving, sparse_moving) + + index = moving._index + + for n, i in enumerate(moving._valid_indices): + if window_type == 'rolling' and i >= window: + prior_date = index[i - window + 1] + else: + prior_date = index[0] + + date = index[i] + + x_iter = {} + for k, v in x.iteritems(): + x_iter[k] = v.truncate(before=prior_date, after=date) + y_iter = y.truncate(before=prior_date, after=date) + + static = ols(y=y_iter, x=x_iter, weights=weights, **kwds) + + self.compare(static, moving, event_index=i, + result_index=n) + + _check_non_raw_results(moving) + + FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat', 'p_value', + 'r2', 'r2_adj', 'rmse', 'std_err', 't_stat', + 'var_beta'] + + def compare(self, static, moving, event_index=None, + result_index=None): + + index = moving._index + + # Check resid if we have a time index specified + if event_index is not None: + ref = static._resid_raw[-1] + + label = index[event_index] + + res = moving.resid[label] + + assert_almost_equal(ref, res) + + ref = static._y_fitted_raw[-1] + res = moving.y_fitted[label] + + assert_almost_equal(ref, res) + + # Check y_fitted + + for field in self.FIELDS: + attr = '_%s_raw' % field + + ref = getattr(static, attr) + res = getattr(moving, attr) + + if result_index is not None: + res = res[result_index] + + assert_almost_equal(ref, res) + + def test_ols_object_dtype(self): + df = DataFrame(np.random.randn(20, 2), dtype=object) + model = ols(y=df[0], x=df[1]) + summary = repr(model) + +class TestOLSMisc(unittest.TestCase): + ''' + For test coverage with faux data + ''' + @classmethod + def setupClass(cls): + if not _have_statsmodels: + raise nose.SkipTest + + def test_f_test(self): + x = tm.makeTimeDataFrame() + y = x.pop('A') + + model = ols(y=y, x=x) + + hyp = '1*B+1*C+1*D=0' + result = model.f_test(hyp) + + hyp = ['1*B=0', + '1*C=0', + '1*D=0'] + result = model.f_test(hyp) + assert_almost_equal(result['f-stat'], model.f_stat['f-stat']) + + self.assertRaises(Exception, model.f_test, '1*A=0') + + def test_r2_no_intercept(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + + x_with = x.copy() + x_with['intercept'] = 1. + + model1 = ols(y=y, x=x) + model2 = ols(y=y, x=x_with, intercept=False) + assert_series_equal(model1.beta, model2.beta) + + # TODO: can we infer whether the intercept is there... 
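+        # presumably because, with intercept=False, R^2 is computed against
+        # the uncentered total sum of squares (cf. _use_centered_tss and
+        # _r2_raw in the panel OLS code above), while model1 centers it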
+ self.assert_(model1.r2 != model2.r2) + + # rolling + + model1 = ols(y=y, x=x, window=20) + model2 = ols(y=y, x=x_with, window=20, intercept=False) + assert_frame_equal(model1.beta, model2.beta) + self.assert_((model1.r2 != model2.r2).all()) + + def test_summary_many_terms(self): + x = DataFrame(np.random.randn(100, 20)) + y = np.random.randn(100) + model = ols(y=y, x=x) + model.summary + + def test_y_predict(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + model1 = ols(y=y, x=x) + assert_series_equal(model1.y_predict, model1.y_fitted) + assert_almost_equal(model1._y_predict_raw, model1._y_fitted_raw) + + def test_predict(self): + y = tm.makeTimeSeries() + x = tm.makeTimeDataFrame() + model1 = ols(y=y, x=x) + assert_series_equal(model1.predict(), model1.y_predict) + assert_series_equal(model1.predict(x=x), model1.y_predict) + assert_series_equal(model1.predict(beta=model1.beta), model1.y_predict) + + exog = x.copy() + exog['intercept'] = 1. + rs = Series(np.dot(exog.values, model1.beta.values), x.index) + assert_series_equal(model1.y_predict, rs) + + x2 = x.reindex(columns=x.columns[::-1]) + assert_series_equal(model1.predict(x=x2), model1.y_predict) + + x3 = x2 + 10 + pred3 = model1.predict(x=x3) + x3['intercept'] = 1. + x3 = x3.reindex(columns = model1.beta.index) + expected = Series(np.dot(x3.values, model1.beta.values), x3.index) + assert_series_equal(expected, pred3) + + beta = Series(0., model1.beta.index) + pred4 = model1.predict(beta=beta) + assert_series_equal(Series(0., pred4.index), pred4) + + def test_predict_longer_exog(self): + exogenous = {"1998": "4760","1999": "5904","2000": "4504", + "2001": "9808","2002": "4241","2003": "4086", + "2004": "4687","2005": "7686","2006": "3740", + "2007": "3075","2008": "3753","2009": "4679", + "2010": "5468","2011": "7154","2012": "4292", + "2013": "4283","2014": "4595","2015": "9194", + "2016": "4221","2017": "4520"} + endogenous = {"1998": "691", "1999": "1580", "2000": "80", + "2001": "1450", "2002": "555", "2003": "956", + "2004": "877", "2005": "614", "2006": "468", + "2007": "191"} + + endog = Series(endogenous) + exog = Series(exogenous) + model = ols(y=endog, x=exog) + + pred = model.y_predict + self.assert_(pred.index.equals(exog.index)) + + def test_longpanel_series_combo(self): + wp = tm.makePanel() + lp = wp.to_frame() + + y = lp.pop('ItemA') + model = ols(y=y, x=lp, entity_effects=True, window=20) + self.assert_(notnull(model.beta.values).all()) + self.assert_(isinstance(model, PanelOLS)) + model.summary + + def test_series_rhs(self): + y = tm.makeTimeSeries() + x = tm.makeTimeSeries() + model = ols(y=y, x=x) + expected = ols(y=y, x={'x' : x}) + assert_series_equal(model.beta, expected.beta) + + def test_various_attributes(self): + # just make sure everything "works". 
test correctness elsewhere
+
+        x = DataFrame(np.random.randn(100, 5))
+        y = np.random.randn(100)
+        model = ols(y=y, x=x, window=20)
+
+        series_attrs = ['rank', 'df', 'forecast_mean', 'forecast_vol']
+
+        for attr in series_attrs:
+            value = getattr(model, attr)
+            self.assert_(isinstance(value, Series))
+
+        # works
+        model._results
+
+    def test_catch_regressor_overlap(self):
+        df1 = tm.makeTimeDataFrame().ix[:, ['A', 'B']]
+        df2 = tm.makeTimeDataFrame().ix[:, ['B', 'C', 'D']]
+        y = tm.makeTimeSeries()
+
+        data = {'foo' : df1, 'bar' : df2}
+        self.assertRaises(Exception, ols, y=y, x=data)
+
+    def test_plm_ctor(self):
+        y = tm.makeTimeDataFrame()
+        x = {'a' : tm.makeTimeDataFrame(),
+             'b' : tm.makeTimeDataFrame()}
+
+        model = ols(y=y, x=x, intercept=False)
+        model.summary
+
+        model = ols(y=y, x=Panel(x))
+        model.summary
+
+    def test_plm_attrs(self):
+        y = tm.makeTimeDataFrame()
+        x = {'a' : tm.makeTimeDataFrame(),
+             'b' : tm.makeTimeDataFrame()}
+
+        rmodel = ols(y=y, x=x, window=10)
+        model = ols(y=y, x=x)
+        model.resid
+        rmodel.resid
+
+    def test_plm_lagged_y_predict(self):
+        y = tm.makeTimeDataFrame()
+        x = {'a' : tm.makeTimeDataFrame(),
+             'b' : tm.makeTimeDataFrame()}
+
+        model = ols(y=y, x=x, window=10)
+        result = model.lagged_y_predict(2)
+
+    def test_plm_f_test(self):
+        y = tm.makeTimeDataFrame()
+        x = {'a' : tm.makeTimeDataFrame(),
+             'b' : tm.makeTimeDataFrame()}
+
+        model = ols(y=y, x=x)
+
+        hyp = '1*a+1*b=0'
+        result = model.f_test(hyp)
+
+        hyp = ['1*a=0',
+               '1*b=0']
+        result = model.f_test(hyp)
+        assert_almost_equal(result['f-stat'], model.f_stat['f-stat'])
+
+    def test_plm_exclude_dummy_corner(self):
+        y = tm.makeTimeDataFrame()
+        x = {'a' : tm.makeTimeDataFrame(),
+             'b' : tm.makeTimeDataFrame()}
+
+        model = ols(y=y, x=x, entity_effects=True, dropped_dummies={'entity' : 'D'})
+        model.summary
+
+        self.assertRaises(Exception, ols, y=y, x=x, entity_effects=True,
+                          dropped_dummies={'entity' : 'E'})
+
+class TestPanelOLS(BaseTest):
+
+    FIELDS = ['beta', 'df', 'df_model', 'df_resid', 'f_stat',
+              'p_value', 'r2', 'r2_adj', 'rmse', 'std_err',
+              't_stat', 'var_beta']
+
+    _other_fields = ['resid', 'y_fitted']
+
+    def testFiltering(self):
+        result = ols(y=self.panel_y2, x=self.panel_x2)
+
+        x = result._x
+        index = x.index.get_level_values(0)
+        index = Index(sorted(set(index)))
+        exp_index = Index([datetime(2000, 1, 1), datetime(2000, 1, 3)])
+        self.assertTrue(exp_index.equals(index))
+
+        index = x.index.get_level_values(1)
+        index = Index(sorted(set(index)))
+        exp_index = Index(['A', 'B'])
+        self.assertTrue(exp_index.equals(index))
+
+        x = result._x_filtered
+        index = x.index.get_level_values(0)
+        index = Index(sorted(set(index)))
+        exp_index = Index([datetime(2000, 1, 1),
+                           datetime(2000, 1, 3),
+                           datetime(2000, 1, 4)])
+        self.assertTrue(exp_index.equals(index))
+
+        assert_almost_equal(result._y.values.flat, [1, 4, 5])
+
+        exp_x = [[6, 14, 1],
+                 [9, 17, 1],
+                 [30, 48, 1]]
+        assert_almost_equal(exp_x, result._x.values)
+
+        exp_x_filtered = [[6, 14, 1],
+                          [9, 17, 1],
+                          [30, 48, 1],
+                          [11, 20, 1],
+                          [12, 21, 1]]
+        assert_almost_equal(exp_x_filtered, result._x_filtered.values)
+
+        self.assertTrue(result._x_filtered.index.levels[0].equals(
+            result.y_fitted.index))
+
+    def test_wls_panel(self):
+        y = tm.makeTimeDataFrame()
+        x = Panel({'x1' : tm.makeTimeDataFrame(),
+                   'x2' : tm.makeTimeDataFrame()})
+
+        y.ix[[1, 7], 'A'] = np.nan
+        y.ix[[6, 15], 'B'] = np.nan
+        y.ix[[3, 20], 'C'] = np.nan
+        y.ix[[5, 11], 'D'] = np.nan
+
+        stack_y = y.stack()
+        stack_x = DataFrame(dict((k, v.stack())
+                                 for k, v in
x.iteritems())) + + weights = x.std('items') + stack_weights = weights.stack() + + stack_y.index = stack_y.index._tuple_index + stack_x.index = stack_x.index._tuple_index + stack_weights.index = stack_weights.index._tuple_index + + result = ols(y=y, x=x, weights=1/weights) + expected = ols(y=stack_y, x=stack_x, weights=1/stack_weights) + + assert_almost_equal(result.beta, expected.beta) + + for attr in ['resid', 'y_fitted']: + rvals = getattr(result, attr).stack().values + evals = getattr(expected, attr).values + assert_almost_equal(rvals, evals) + + def testWithTimeEffects(self): + result = ols(y=self.panel_y2, x=self.panel_x2, time_effects=True) + + assert_almost_equal(result._y_trans.values.flat, [0, -0.5, 0.5]) + + exp_x = [[0, 0], [-10.5, -15.5], [10.5, 15.5]] + assert_almost_equal(result._x_trans.values, exp_x) + + # _check_non_raw_results(result) + + def testWithEntityEffects(self): + result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + + exp_x = DataFrame([[0, 6, 14, 1], [0, 9, 17, 1], [1, 30, 48, 1]], + index=result._x.index, columns=['FE_B', 'x1', 'x2', + 'intercept'], + dtype=float) + tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns]) + # _check_non_raw_results(result) + + def testWithEntityEffectsAndDroppedDummies(self): + result = ols(y=self.panel_y2, x=self.panel_x2, entity_effects=True, + dropped_dummies={'entity' : 'B'}) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + exp_x = DataFrame([[1, 6, 14, 1], [1, 9, 17, 1], [0, 30, 48, 1]], + index=result._x.index, columns=['FE_A', 'x1', 'x2', + 'intercept'], + dtype=float) + tm.assert_frame_equal(result._x, exp_x.ix[:, result._x.columns]) + # _check_non_raw_results(result) + + def testWithXEffects(self): + result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1']) + + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + + res = result._x + exp_x = DataFrame([[0, 0, 14, 1], [0, 1, 17, 1], [1, 0, 48, 1]], + columns=['x1_30', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) + + def testWithXEffectsAndDroppedDummies(self): + result = ols(y=self.panel_y2, x=self.panel_x2, x_effects=['x1'], + dropped_dummies={'x1' : 30}) + + res = result._x + assert_almost_equal(result._y.values.flat, [1, 4, 5]) + exp_x = DataFrame([[1, 0, 14, 1], [0, 1, 17, 1], [0, 0, 48, 1]], + columns=['x1_6', 'x1_9', 'x2', 'intercept'], + index=res.index, dtype=float) + + assert_frame_equal(res, exp_x.reindex(columns=res.columns)) + + def testWithXEffectsAndConversion(self): + result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2']) + + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) + exp_x = [[0, 0, 0, 1, 1], [1, 0, 0, 0, 1], [0, 1, 1, 0, 1], + [0, 0, 0, 1, 1]] + assert_almost_equal(result._x.values, exp_x) + + exp_index = Index(['x1_B', 'x1_C', 'x2_baz', 'x2_foo', 'intercept']) + self.assertTrue(exp_index.equals(result._x.columns)) + + # _check_non_raw_results(result) + + def testWithXEffectsAndConversionAndDroppedDummies(self): + result = ols(y=self.panel_y3, x=self.panel_x3, x_effects=['x1', 'x2'], + dropped_dummies={'x2' : 'foo'}) + + assert_almost_equal(result._y.values.flat, [1, 2, 3, 4]) + exp_x = [[0, 0, 0, 0, 1], [1, 0, 1, 0, 1], [0, 1, 0, 1, 1], + [0, 0, 0, 0, 1]] + assert_almost_equal(result._x.values, exp_x) + + exp_index = Index(['x1_B', 'x1_C', 'x2_bar', 'x2_baz', 'intercept']) + self.assertTrue(exp_index.equals(result._x.columns)) + + # 
_check_non_raw_results(result) + + def testForSeries(self): + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y) + + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y, nw_lags=0) + + self.checkForSeries(self.series_panel_x, self.series_panel_y, + self.series_x, self.series_y, nw_lags=1, + nw_overlap=True) + + + def testRolling(self): + self.checkMovingOLS(self.panel_x, self.panel_y) + + def testRollingWithFixedEffects(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + entity_effects=True) + self.checkMovingOLS(self.panel_x, self.panel_y, intercept=False, + entity_effects=True) + + def testRollingWithTimeEffects(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + time_effects=True) + + def testRollingWithNeweyWest(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1) + + def testRollingWithEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + cluster='entity') + + def testRollingWithTimeEffectsAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + time_effects=True, cluster='entity') + + def testRollingWithTimeCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + cluster='time') + + def testRollingWithNeweyWestAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1, cluster='entity') + + def testRollingWithNeweyWestAndTimeEffectsAndEntityCluster(self): + self.checkMovingOLS(self.panel_x, self.panel_y, + nw_lags=1, cluster='entity', + time_effects=True) + + def testExpanding(self): + self.checkMovingOLS(self.panel_x, self.panel_y, window_type='expanding') + + def testNonPooled(self): + self.checkNonPooled(y=self.panel_y, x=self.panel_x) + self.checkNonPooled(y=self.panel_y, x=self.panel_x, + window_type='rolling', window=25, min_periods=10) + + def checkNonPooled(self, x, y, **kwds): + # For now, just check that it doesn't crash + result = ols(y=y, x=x, pool=False, **kwds) + + _check_repr(result) + for attr in NonPooledPanelOLS.ATTRIBUTES: + _check_repr(getattr(result, attr)) + + def checkMovingOLS(self, x, y, window_type='rolling', **kwds): + window = 25 # must be larger than rank of x + + moving = ols(y=y, x=x, window_type=window_type, + window=window, **kwds) + + index = moving._index + + for n, i in enumerate(moving._valid_indices): + if window_type == 'rolling' and i >= window: + prior_date = index[i - window + 1] + else: + prior_date = index[0] + + date = index[i] + + x_iter = {} + for k, v in x.iteritems(): + x_iter[k] = v.truncate(before=prior_date, after=date) + y_iter = y.truncate(before=prior_date, after=date) + + static = ols(y=y_iter, x=x_iter, **kwds) + + self.compare(static, moving, event_index=i, + result_index=n) + + _check_non_raw_results(moving) + + def checkForSeries(self, x, y, series_x, series_y, **kwds): + # Consistency check with simple OLS. 
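+        # the series_panel_* fixtures carry a single column ('ColA'), so the
+        # pooled panel fit should collapse to the plain time-series OLS on
+        # the same observations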
+ result = ols(y=y, x=x, **kwds) + reference = ols(y=series_y, x=series_x, **kwds) + + self.compare(reference, result) + + def compare(self, static, moving, event_index=None, + result_index=None): + + # Check resid if we have a time index specified + if event_index is not None: + staticSlice = _period_slice(static, -1) + movingSlice = _period_slice(moving, event_index) + + ref = static._resid_raw[staticSlice] + res = moving._resid_raw[movingSlice] + + assert_almost_equal(ref, res) + + ref = static._y_fitted_raw[staticSlice] + res = moving._y_fitted_raw[movingSlice] + + assert_almost_equal(ref, res) + + # Check y_fitted + + for field in self.FIELDS: + attr = '_%s_raw' % field + + ref = getattr(static, attr) + res = getattr(moving, attr) + + if result_index is not None: + res = res[result_index] + + assert_almost_equal(ref, res) + + def test_auto_rolling_window_type(self): + data = tm.makeTimeDataFrame() + y = data.pop('A') + + window_model = ols(y=y, x=data, window=20, min_periods=10) + rolling_model = ols(y=y, x=data, window=20, min_periods=10, + window_type='rolling') + + assert_frame_equal(window_model.beta, rolling_model.beta) + +def _check_non_raw_results(model): + _check_repr(model) + _check_repr(model.resid) + _check_repr(model.summary_as_matrix) + _check_repr(model.y_fitted) + _check_repr(model.y_predict) + +def _period_slice(panelModel, i): + index = panelModel._x_trans.index + period = index.levels[0][i] + + L, R = index.get_major_bounds(period, period) + + return slice(L, R) + +class TestOLSFilter(unittest.TestCase): + + def setUp(self): + date_index = date_range(datetime(2009, 12, 11), periods=3, + freq=datetools.bday) + ts = Series([3, 1, 4], index=date_index) + self.TS1 = ts + + date_index = date_range(datetime(2009, 12, 11), periods=5, + freq=datetools.bday) + ts = Series([1, 5, 9, 2, 6], index=date_index) + self.TS2 = ts + + date_index = date_range(datetime(2009, 12, 11), periods=3, + freq=datetools.bday) + ts = Series([5, np.nan, 3], index=date_index) + self.TS3 = ts + + date_index = date_range(datetime(2009, 12, 11), periods=5, + freq=datetools.bday) + ts = Series([np.nan, 5, 8, 9, 7], index=date_index) + self.TS4 = ts + + data = {'x1' : self.TS2, 'x2' : self.TS4} + self.DF1 = DataFrame(data=data) + + data = {'x1' : self.TS2, 'x2' : self.TS4} + self.DICT1 = data + + def testFilterWithSeriesRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, {'x1' : self.TS2}, None) + self.tsAssertEqual(self.TS1, lhs) + self.tsAssertEqual(self.TS2[:3], rhs['x1']) + self.tsAssertEqual(self.TS2, rhs_pre['x1']) + + def testFilterWithSeriesRHS2(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS2, {'x1' : self.TS1}, None) + self.tsAssertEqual(self.TS2[:3], lhs) + self.tsAssertEqual(self.TS1, rhs['x1']) + self.tsAssertEqual(self.TS1, rhs_pre['x1']) + + def testFilterWithSeriesRHS3(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS3, {'x1' : self.TS4}, None) + exp_lhs = self.TS3[2:3] + exp_rhs = self.TS4[2:3] + exp_rhs_pre = self.TS4[1:] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs, rhs['x1']) + self.tsAssertEqual(exp_rhs_pre, rhs_pre['x1']) + + def testFilterWithDataFrameRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DF1, None) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def 
testFilterWithDictRHS(self): + (lhs, rhs, weights, rhs_pre, + index, valid) = _filter_data(self.TS1, self.DICT1, None) + exp_lhs = self.TS1[1:] + exp_rhs1 = self.TS2[1:3] + exp_rhs2 = self.TS4[1:3] + self.tsAssertEqual(exp_lhs, lhs) + self.tsAssertEqual(exp_rhs1, rhs['x1']) + self.tsAssertEqual(exp_rhs2, rhs['x2']) + + def tsAssertEqual(self, ts1, ts2): + self.assert_(np.array_equal(ts1, ts2)) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/stats/tests/test_var.py b/pandas/stats/tests/test_var.py new file mode 100644 index 00000000..b48fc5bb --- /dev/null +++ b/pandas/stats/tests/test_var.py @@ -0,0 +1,191 @@ +from numpy.testing import run_module_suite, assert_equal, TestCase + +from pandas.util.testing import assert_almost_equal + +import nose +import unittest + +raise nose.SkipTest('skipping this for now') + +try: + import statsmodels.tsa.var as sm_var + import statsmodels as sm +except ImportError: + import scikits.statsmodels.tsa.var as sm_var + import scikits.statsmodels as sm + + +import pandas.stats.var as _pvar +reload(_pvar) +from pandas.stats.var import VAR + +try: + import rpy2.robjects as robj + from rpy2.robjects import r + from rpy2.robjects.packages import importr + import pandas.rpy.common as rpy + vars = importr('vars') + urca = importr('urca') +except ImportError: + pass + +DECIMAL_6 = 6 +DECIMAL_5 = 5 +DECIMAL_4 = 4 +DECIMAL_3 = 3 +DECIMAL_2 = 2 + +class CheckVAR(object): + def test_params(self): + assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_3) + + def test_neqs(self): + assert_equal(self.res1.neqs, self.res2.neqs) + + def test_nobs(self): + assert_equal(self.res1.avobs, self.res2.nobs) + + def test_df_eq(self): + assert_equal(self.res1.df_eq, self.res2.df_eq) + + def test_rmse(self): + results = self.res1.results + for i in range(len(results)): + assert_almost_equal(results[i].mse_resid**.5, + eval('self.res2.rmse_'+str(i+1)), DECIMAL_6) + + def test_rsquared(self): + results = self.res1.results + for i in range(len(results)): + assert_almost_equal(results[i].rsquared, + eval('self.res2.rsquared_'+str(i+1)), DECIMAL_3) + + def test_llf(self): + results = self.res1.results + assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_2) + for i in range(len(results)): + assert_almost_equal(results[i].llf, + eval('self.res2.llf_'+str(i+1)), DECIMAL_2) + + def test_aic(self): + assert_almost_equal(self.res1.aic, self.res2.aic) + + def test_bic(self): + assert_almost_equal(self.res1.bic, self.res2.bic) + + def test_hqic(self): + assert_almost_equal(self.res1.hqic, self.res2.hqic) + + def test_fpe(self): + assert_almost_equal(self.res1.fpe, self.res2.fpe) + + def test_detsig(self): + assert_almost_equal(self.res1.detomega, self.res2.detsig) + + def test_bse(self): + assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) + + +class Foo(object): + def __init__(self): + data = sm.datasets.macrodata.load() + data = data.data[['realinv','realgdp','realcons']].view((float,3)) + data = diff(log(data),axis=0) + self.res1 = VAR2(endog=data).fit(maxlag=2) + from results import results_var + self.res2 = results_var.MacrodataResults() + + +class RVAR(object): + """ + Estimates VAR model using R vars package and rpy + """ + + def __init__(self, data, p=1, type='both'): + self.rdata = data + self.p = p + self.type = type + + self.pydata = rpy.convert_robj(data) + self._estimate = None + self.estimate() + + @property + def aic(self): + pass + + @property + def 
bic(self): + pass + + @property + def beta(self): + return rpy.convert_robj(r.coef(self._estimate)) + + def summary(self, equation=None): + print r.summary(self._estimate, equation=equation) + + def output(self): + print self._estimate + + def estimate(self): + self._estimate = r.VAR(self.rdata, p=self.p, type=self.type) + + def plot(self, names=None): + r.plot(model._estimate, names=names) + + def serial_test(self, lags_pt=16, type='PT.asymptotic'): + f = r['serial.test'] + + test = f(self._estimate, **{'lags.pt' : lags_pt, + 'type' : type}) + + return test + + def data_summary(self): + print r.summary(self.rdata) + +class TestVAR(TestCase): + + def setUp(self): + try: + import rpy2 + except ImportError: + raise nose.SkipTest("No rpy2") + + self.rdata = rpy.load_data('Canada', package='vars', convert=False) + self.data = rpy.load_data('Canada', package='vars', convert=True) + + self.res = VAR(self.data) + self.ref = RVAR(self.rdata) + + def test_foo(self): + pass + +if __name__ == '__main__': + # canada = rpy.load_data('Canada', package='vars', convert=False) + + # model = RVAR(canada, p=1) + + # summary(Canada) + + # plot(Canada, nc=2, xlab="")ppp + + # adf1 <- summary(ur.df(Canada[, "prod"], type = "trend", lags = 2)) + # adf1 + + # adf2 <- summary(ur.df(diff(Canada[, "prod"]), type = "drift", lags = 1)) + # adf2 + + # VARselect(Canada, lag.max = 8, type = "both") + + # Canada <- Canada[, c("prod", "e", "U", "rw")] + + # p1ct <- VAR(Canada, p = 1, type = "both") + # p1ct + + # coefs <- coef(p1ct) + # class(coefs) + + # run_module_suite() + unittest.main() diff --git a/pandas/stats/var.py b/pandas/stats/var.py new file mode 100644 index 00000000..bce0404a --- /dev/null +++ b/pandas/stats/var.py @@ -0,0 +1,586 @@ +from __future__ import division + +import numpy as np + +from pandas.util.decorators import cache_readonly +from pandas.core.frame import DataFrame +from pandas.core.panel import Panel +from pandas.core.series import Series +import pandas.stats.common as common +from pandas.stats.math import inv +from pandas.stats.ols import _combine_rhs + +class VAR(object): + """ + Estimates VAR(p) regression on multivariate time series data + presented in pandas data structures. + + Parameters + ---------- + data : DataFrame or dict of Series + p : lags to include + + """ + + def __init__(self, data, p=1, intercept=True): + try: + import statsmodels.tsa.var as sm_var + except ImportError: + import scikits.statsmodels.tsa.var as sm_var + + self._data = DataFrame(_combine_rhs(data)) + self._p = p + + self._columns = self._data.columns + self._index = self._data.index + + self._intercept = intercept + + @cache_readonly + def aic(self): + """Returns the Akaike information criterion.""" + return self._ic['aic'] + + @cache_readonly + def bic(self): + """Returns the Bayesian information criterion.""" + return self._ic['bic'] + + @cache_readonly + def beta(self): + """ + Returns a DataFrame, where each column x1 contains the betas + calculated by regressing the x1 column of the VAR input with + the lagged input. + + Returns + ------- + DataFrame + """ + d = dict([(key, value.beta) + for (key, value) in self.ols_results.iteritems()]) + return DataFrame(d) + + def forecast(self, h): + """ + Returns a DataFrame containing the forecasts for 1, 2, ..., n time + steps. Each column x1 contains the forecasts of the x1 column. + + Parameters + ---------- + n: int + Number of time steps ahead to forecast. 
+ + Returns + ------- + DataFrame + """ + forecast = self._forecast_raw(h)[:, 0, :] + return DataFrame(forecast, index=xrange(1, 1 + h), + columns=self._columns) + + def forecast_cov(self, h): + """ + Returns the covariance of the forecast residuals. + + Returns + ------- + DataFrame + """ + return [DataFrame(value, index=self._columns, columns=self._columns) + for value in self._forecast_cov_raw(h)] + + def forecast_std_err(self, h): + """ + Returns the standard errors of the forecast residuals. + + Returns + ------- + DataFrame + """ + return DataFrame(self._forecast_std_err_raw(h), + index=xrange(1, 1 + h), columns=self._columns) + + @cache_readonly + def granger_causality(self): + """Returns the f-stats and p-values from the Granger Causality Test. + + If the data consists of columns x1, x2, x3, then we perform the + following regressions: + + x1 ~ L(x2, x3) + x1 ~ L(x1, x3) + x1 ~ L(x1, x2) + + The f-stats of these results are placed in the 'x1' column of the + returned DataFrame. We then repeat for x2, x3. + + Returns + ------- + Dict, where 'f-stat' returns the DataFrame containing the f-stats, + and 'p-value' returns the DataFrame containing the corresponding + p-values of the f-stats. + """ + from pandas.stats.api import ols + from scipy.stats import f + + d = {} + for col in self._columns: + d[col] = {} + for i in xrange(1, 1 + self._p): + lagged_data = self._lagged_data[i].filter(self._columns - [col]) + + for key, value in lagged_data.iteritems(): + d[col][_make_param_name(i, key)] = value + + f_stat_dict = {} + p_value_dict = {} + + for col, y in self._data.iteritems(): + ssr_full = (self.resid[col] ** 2).sum() + + f_stats = [] + p_values = [] + + for col2 in self._columns: + result = ols(y=y, x=d[col2]) + + resid = result.resid + ssr_reduced = (resid ** 2).sum() + + M = self._p + N = self._nobs + K = self._k * self._p + 1 + f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K)) + f_stats.append(f_stat) + + p_value = f.sf(f_stat, M, N - K) + p_values.append(p_value) + + f_stat_dict[col] = Series(f_stats, self._columns) + p_value_dict[col] = Series(p_values, self._columns) + + f_stat_mat = DataFrame(f_stat_dict) + p_value_mat = DataFrame(p_value_dict) + + return { + 'f-stat' : f_stat_mat, + 'p-value' : p_value_mat, + } + + @cache_readonly + def ols_results(self): + """ + Returns the results of the regressions: + x_1 ~ L(X) + x_2 ~ L(X) + ... + x_k ~ L(X) + + where X = [x_1, x_2, ..., x_k] + and L(X) represents the columns of X lagged 1, 2, ..., n lags + (n is the user-provided number of lags). + + Returns + ------- + dict + """ + from pandas.stats.api import ols + + d = {} + for i in xrange(1, 1 + self._p): + for col, series in self._lagged_data[i].iteritems(): + d[_make_param_name(i, col)] = series + + result = dict([(col, ols(y=y, x=d, intercept=self._intercept)) + for col, y in self._data.iteritems()]) + + return result + + @cache_readonly + def resid(self): + """ + Returns the DataFrame containing the residuals of the VAR regressions. + Each column x1 contains the residuals generated by regressing the x1 + column of the input against the lagged input. 
+ + Returns + ------- + DataFrame + """ + d = dict([(col, series.resid) + for (col, series) in self.ols_results.iteritems()]) + return DataFrame(d, index=self._index) + + @cache_readonly + def summary(self): + template = """ +%(banner_top)s + +Number of Observations: %(nobs)d +AIC: %(aic).3f +BIC: %(bic).3f + +%(banner_coef)s +%(coef_table)s +%(banner_end)s +""" + params = { + 'banner_top' : common.banner('Summary of VAR'), + 'banner_coef' : common.banner('Summary of Estimated Coefficients'), + 'banner_end' : common.banner('End of Summary'), + 'coef_table' : self.beta, + 'aic' : self.aic, + 'bic' : self.bic, + 'nobs' : self._nobs, + } + + return template % params + + @cache_readonly + def _alpha(self): + """ + Returns array where the i-th element contains the intercept + when regressing the i-th column of self._data with the lagged data. + """ + if self._intercept: + return self._beta_raw[-1] + else: + return np.zeros(self._k) + + @cache_readonly + def _beta_raw(self): + return np.array([self.beta[col].values() for col in self._columns]).T + + def _trans_B(self, h): + """ + Returns 0, 1, ..., (h-1)-th power of transpose of B as defined in + equation (4) on p. 142 of the Stata 11 Time Series reference book. + """ + result = [np.eye(1 + self._k * self._p)] + + row1 = np.zeros((1, 1 + self._k * self._p)) + row1[0, 0] = 1 + + v = self._alpha.reshape((self._k, 1)) + row2 = np.hstack(tuple([v] + self._lag_betas)) + + m = self._k * (self._p - 1) + row3 = np.hstack(( + np.zeros((m, 1)), + np.eye(m), + np.zeros((m, self._k)) + )) + + trans_B = np.vstack((row1, row2, row3)).T + + result.append(trans_B) + + for i in xrange(2, h): + result.append(np.dot(trans_B, result[i - 1])) + + return result + + @cache_readonly + def _x(self): + values = np.array([ + self._lagged_data[i][col].values() + for i in xrange(1, 1 + self._p) + for col in self._columns + ]).T + + x = np.hstack((np.ones((len(values), 1)), values))[self._p:] + + return x + + @cache_readonly + def _cov_beta(self): + cov_resid = self._sigma + + x = self._x + + inv_cov_x = inv(np.dot(x.T, x)) + + return np.kron(inv_cov_x, cov_resid) + + def _data_xs(self, i): + """ + Returns the cross-section of the data at the given timestep. + """ + return self._data.values[i] + + def _forecast_cov_raw(self, n): + resid = self._forecast_cov_resid_raw(n) + #beta = self._forecast_cov_beta_raw(n) + + #return [a + b for a, b in izip(resid, beta)] + # TODO: ignore the beta forecast std err until it's verified + + return resid + + def _forecast_cov_beta_raw(self, n): + """ + Returns the covariance of the beta errors for the forecast at + 1, 2, ..., n timesteps. + """ + p = self._p + + values = self._data.values + T = len(values) - self._p - 1 + + results = [] + + for h in xrange(1, n + 1): + psi = self._psi(h) + trans_B = self._trans_B(h) + + sum = 0 + + cov_beta = self._cov_beta + + for t in xrange(T + 1): + index = t + p + y = values.take(xrange(index, index - p, -1), axis=0).flatten() + trans_Z = np.hstack(([1], y)) + trans_Z = trans_Z.reshape(1, len(trans_Z)) + + sum2 = 0 + for i in xrange(h): + ZB = np.dot(trans_Z, trans_B[h - 1 - i]) + + prod = np.kron(ZB, psi[i]) + sum2 = sum2 + prod + + sum = sum + chain_dot(sum2, cov_beta, sum2.T) + + results.append(sum / (T + 1)) + + return results + + def _forecast_cov_resid_raw(self, h): + """ + Returns the covariance of the residual errors for the forecast at + 1, 2, ..., h timesteps. 
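+        The loop below accumulates the standard VAR forecast-MSE recursion,
+        MSE(j) = sum_{i=0}^{j-1} Psi_i * Sigma * Psi_i', where the Psi_i are
+        the moving-average matrices from ``_psi`` and Sigma is the residual
+        covariance ``_sigma``.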
+ """ + psi_values = self._psi(h) + sum = 0 + result = [] + for i in xrange(h): + psi = psi_values[i] + sum = sum + chain_dot(psi, self._sigma, psi.T) + result.append(sum) + + return result + + def _forecast_raw(self, h): + """ + Returns the forecast at 1, 2, ..., h timesteps in the future. + """ + k = self._k + result = [] + for i in xrange(h): + sum = self._alpha.reshape(1, k) + for j in xrange(self._p): + beta = self._lag_betas[j] + idx = i - j + if idx > 0: + y = result[idx - 1] + else: + y = self._data_xs(idx - 1) + + sum = sum + np.dot(beta, y.T).T + result.append(sum) + + return np.array(result) + + def _forecast_std_err_raw(self, h): + """ + Returns the standard error of the forecasts + at 1, 2, ..., n timesteps. + """ + return np.array([np.sqrt(np.diag(value)) + for value in self._forecast_cov_raw(h)]) + + @cache_readonly + def _ic(self): + """ + Returns the Akaike/Bayesian information criteria. + """ + RSS = self._rss + k = self._p * (self._k * self._p + 1) + n = self._nobs * self._k + + return {'aic' : 2 * k + n * np.log(RSS / n), + 'bic' : n * np.log(RSS / n) + k * np.log(n)} + + @cache_readonly + def _k(self): + return len(self._columns) + + @cache_readonly + def _lag_betas(self): + """ + Returns list of B_i, where B_i represents the (k, k) matrix + with the j-th row containing the betas of regressing the j-th + column of self._data with self._data lagged i time steps. + First element is B_1, second element is B_2, etc. + """ + k = self._k + b = self._beta_raw + return [b[k * i : k * (i + 1)].T for i in xrange(self._p)] + + @cache_readonly + def _lagged_data(self): + return dict([(i, self._data.shift(i)) + for i in xrange(1, 1 + self._p)]) + + @cache_readonly + def _nobs(self): + return len(self._data) - self._p + + def _psi(self, h): + """ + psi value used for calculating standard error. + + Returns [psi_0, psi_1, ..., psi_(h - 1)] + """ + k = self._k + result = [np.eye(k)] + for i in xrange(1, h): + result.append(sum( + [np.dot(result[i - j], self._lag_betas[j - 1]) + for j in xrange(1, 1 + i) + if j <= self._p])) + + return result + + @cache_readonly + def _resid_raw(self): + resid = np.array([self.ols_results[col]._resid_raw + for col in self._columns]) + return resid + + @cache_readonly + def _rss(self): + """Returns the sum of the squares of the residuals.""" + return (self._resid_raw ** 2).sum() + + @cache_readonly + def _sigma(self): + """Returns covariance of resids.""" + k = self._k + n = self._nobs + + resid = self._resid_raw + + return np.dot(resid, resid.T) / (n - k) + + def __repr__(self): + return self.summary + +def lag_select(data, max_lags=5, ic=None): + """ + Select number of lags based on a variety of information criteria + + Parameters + ---------- + data : DataFrame-like + max_lags : int + Maximum number of lags to evaluate + ic : {None, 'aic', 'bic', ...} + Choosing None will just display the results + + Returns + ------- + None + """ + pass + +class PanelVAR(VAR): + """ + Performs Vector Autoregression on panel data. 
+ + Parameters + ---------- + data: Panel or dict of DataFrame + lags: int + """ + def __init__(self, data, lags, intercept=True): + self._data = _prep_panel_data(data) + self._p = lags + self._intercept = intercept + + self._columns = self._data.items + + @cache_readonly + def _nobs(self): + """Returns the number of observations.""" + _, timesteps, entities = self._data.values.shape + return (timesteps - self._p) * entities + + @cache_readonly + def _rss(self): + """Returns the sum of the squares of the residuals.""" + return (self.resid.values ** 2).sum() + + def forecast(self, h): + """ + Returns the forecasts at 1, 2, ..., n timesteps in the future. + """ + forecast = self._forecast_raw(h).T.swapaxes(1, 2) + index = xrange(1, 1 + h) + w = Panel(forecast, items=self._data.items, major_axis=index, + minor_axis=self._data.minor_axis) + return w + + @cache_readonly + def resid(self): + """ + Returns the DataFrame containing the residuals of the VAR regressions. + Each column x1 contains the residuals generated by regressing the x1 + column of the input against the lagged input. + + Returns + ------- + DataFrame + """ + d = dict([(key, value.resid) + for (key, value) in self.ols_results.iteritems()]) + return Panel.fromDict(d) + + def _data_xs(self, i): + return self._data.values[:, i, :].T + + @cache_readonly + def _sigma(self): + """Returns covariance of resids.""" + k = self._k + resid = _drop_incomplete_rows(self.resid.toLong().values) + n = len(resid) + return np.dot(resid.T, resid) / (n - k) + + +def _prep_panel_data(data): + """Converts the given data into a Panel.""" + if isinstance(data, Panel): + return data + + return Panel.fromDict(data) + +def _drop_incomplete_rows(array): + mask = np.isfinite(array).all(1) + indices = np.arange(len(array))[mask] + return array.take(indices, 0) + +def _make_param_name(lag, name): + return 'L%d.%s' % (lag, name) + +def chain_dot(*matrices): + """ + Returns the dot product of the given matrices. 
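+    For example, ``chain_dot(A, B, C)`` evaluates ``np.dot(A, np.dot(B, C))``,
+    i.e. the ordinary matrix product ABC.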
+ + Parameters + ---------- + matrices: argument list of ndarray + """ + return reduce(lambda x, y: np.dot(y, x), matrices[::-1]) diff --git a/pandas/tests/__init__.py b/pandas/tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/pandas/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/pandas/tests/data/iris.csv b/pandas/tests/data/iris.csv new file mode 100644 index 00000000..c19b9c36 --- /dev/null +++ b/pandas/tests/data/iris.csv @@ -0,0 +1,151 @@ +SepalLength,SepalWidth,PetalLength,PetalWidth,Name +5.1,3.5,1.4,0.2,Iris-setosa +4.9,3.0,1.4,0.2,Iris-setosa +4.7,3.2,1.3,0.2,Iris-setosa +4.6,3.1,1.5,0.2,Iris-setosa +5.0,3.6,1.4,0.2,Iris-setosa +5.4,3.9,1.7,0.4,Iris-setosa +4.6,3.4,1.4,0.3,Iris-setosa +5.0,3.4,1.5,0.2,Iris-setosa +4.4,2.9,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.4,3.7,1.5,0.2,Iris-setosa +4.8,3.4,1.6,0.2,Iris-setosa +4.8,3.0,1.4,0.1,Iris-setosa +4.3,3.0,1.1,0.1,Iris-setosa +5.8,4.0,1.2,0.2,Iris-setosa +5.7,4.4,1.5,0.4,Iris-setosa +5.4,3.9,1.3,0.4,Iris-setosa +5.1,3.5,1.4,0.3,Iris-setosa +5.7,3.8,1.7,0.3,Iris-setosa +5.1,3.8,1.5,0.3,Iris-setosa +5.4,3.4,1.7,0.2,Iris-setosa +5.1,3.7,1.5,0.4,Iris-setosa +4.6,3.6,1.0,0.2,Iris-setosa +5.1,3.3,1.7,0.5,Iris-setosa +4.8,3.4,1.9,0.2,Iris-setosa +5.0,3.0,1.6,0.2,Iris-setosa +5.0,3.4,1.6,0.4,Iris-setosa +5.2,3.5,1.5,0.2,Iris-setosa +5.2,3.4,1.4,0.2,Iris-setosa +4.7,3.2,1.6,0.2,Iris-setosa +4.8,3.1,1.6,0.2,Iris-setosa +5.4,3.4,1.5,0.4,Iris-setosa +5.2,4.1,1.5,0.1,Iris-setosa +5.5,4.2,1.4,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +5.0,3.2,1.2,0.2,Iris-setosa +5.5,3.5,1.3,0.2,Iris-setosa +4.9,3.1,1.5,0.1,Iris-setosa +4.4,3.0,1.3,0.2,Iris-setosa +5.1,3.4,1.5,0.2,Iris-setosa +5.0,3.5,1.3,0.3,Iris-setosa +4.5,2.3,1.3,0.3,Iris-setosa +4.4,3.2,1.3,0.2,Iris-setosa +5.0,3.5,1.6,0.6,Iris-setosa +5.1,3.8,1.9,0.4,Iris-setosa +4.8,3.0,1.4,0.3,Iris-setosa +5.1,3.8,1.6,0.2,Iris-setosa +4.6,3.2,1.4,0.2,Iris-setosa +5.3,3.7,1.5,0.2,Iris-setosa +5.0,3.3,1.4,0.2,Iris-setosa +7.0,3.2,4.7,1.4,Iris-versicolor +6.4,3.2,4.5,1.5,Iris-versicolor +6.9,3.1,4.9,1.5,Iris-versicolor +5.5,2.3,4.0,1.3,Iris-versicolor +6.5,2.8,4.6,1.5,Iris-versicolor +5.7,2.8,4.5,1.3,Iris-versicolor +6.3,3.3,4.7,1.6,Iris-versicolor +4.9,2.4,3.3,1.0,Iris-versicolor +6.6,2.9,4.6,1.3,Iris-versicolor +5.2,2.7,3.9,1.4,Iris-versicolor +5.0,2.0,3.5,1.0,Iris-versicolor +5.9,3.0,4.2,1.5,Iris-versicolor +6.0,2.2,4.0,1.0,Iris-versicolor +6.1,2.9,4.7,1.4,Iris-versicolor +5.6,2.9,3.6,1.3,Iris-versicolor +6.7,3.1,4.4,1.4,Iris-versicolor +5.6,3.0,4.5,1.5,Iris-versicolor +5.8,2.7,4.1,1.0,Iris-versicolor +6.2,2.2,4.5,1.5,Iris-versicolor +5.6,2.5,3.9,1.1,Iris-versicolor +5.9,3.2,4.8,1.8,Iris-versicolor +6.1,2.8,4.0,1.3,Iris-versicolor +6.3,2.5,4.9,1.5,Iris-versicolor +6.1,2.8,4.7,1.2,Iris-versicolor +6.4,2.9,4.3,1.3,Iris-versicolor +6.6,3.0,4.4,1.4,Iris-versicolor +6.8,2.8,4.8,1.4,Iris-versicolor +6.7,3.0,5.0,1.7,Iris-versicolor +6.0,2.9,4.5,1.5,Iris-versicolor +5.7,2.6,3.5,1.0,Iris-versicolor +5.5,2.4,3.8,1.1,Iris-versicolor +5.5,2.4,3.7,1.0,Iris-versicolor +5.8,2.7,3.9,1.2,Iris-versicolor +6.0,2.7,5.1,1.6,Iris-versicolor +5.4,3.0,4.5,1.5,Iris-versicolor +6.0,3.4,4.5,1.6,Iris-versicolor +6.7,3.1,4.7,1.5,Iris-versicolor +6.3,2.3,4.4,1.3,Iris-versicolor +5.6,3.0,4.1,1.3,Iris-versicolor +5.5,2.5,4.0,1.3,Iris-versicolor +5.5,2.6,4.4,1.2,Iris-versicolor +6.1,3.0,4.6,1.4,Iris-versicolor +5.8,2.6,4.0,1.2,Iris-versicolor +5.0,2.3,3.3,1.0,Iris-versicolor +5.6,2.7,4.2,1.3,Iris-versicolor +5.7,3.0,4.2,1.2,Iris-versicolor +5.7,2.9,4.2,1.3,Iris-versicolor 
+6.2,2.9,4.3,1.3,Iris-versicolor +5.1,2.5,3.0,1.1,Iris-versicolor +5.7,2.8,4.1,1.3,Iris-versicolor +6.3,3.3,6.0,2.5,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +7.1,3.0,5.9,2.1,Iris-virginica +6.3,2.9,5.6,1.8,Iris-virginica +6.5,3.0,5.8,2.2,Iris-virginica +7.6,3.0,6.6,2.1,Iris-virginica +4.9,2.5,4.5,1.7,Iris-virginica +7.3,2.9,6.3,1.8,Iris-virginica +6.7,2.5,5.8,1.8,Iris-virginica +7.2,3.6,6.1,2.5,Iris-virginica +6.5,3.2,5.1,2.0,Iris-virginica +6.4,2.7,5.3,1.9,Iris-virginica +6.8,3.0,5.5,2.1,Iris-virginica +5.7,2.5,5.0,2.0,Iris-virginica +5.8,2.8,5.1,2.4,Iris-virginica +6.4,3.2,5.3,2.3,Iris-virginica +6.5,3.0,5.5,1.8,Iris-virginica +7.7,3.8,6.7,2.2,Iris-virginica +7.7,2.6,6.9,2.3,Iris-virginica +6.0,2.2,5.0,1.5,Iris-virginica +6.9,3.2,5.7,2.3,Iris-virginica +5.6,2.8,4.9,2.0,Iris-virginica +7.7,2.8,6.7,2.0,Iris-virginica +6.3,2.7,4.9,1.8,Iris-virginica +6.7,3.3,5.7,2.1,Iris-virginica +7.2,3.2,6.0,1.8,Iris-virginica +6.2,2.8,4.8,1.8,Iris-virginica +6.1,3.0,4.9,1.8,Iris-virginica +6.4,2.8,5.6,2.1,Iris-virginica +7.2,3.0,5.8,1.6,Iris-virginica +7.4,2.8,6.1,1.9,Iris-virginica +7.9,3.8,6.4,2.0,Iris-virginica +6.4,2.8,5.6,2.2,Iris-virginica +6.3,2.8,5.1,1.5,Iris-virginica +6.1,2.6,5.6,1.4,Iris-virginica +7.7,3.0,6.1,2.3,Iris-virginica +6.3,3.4,5.6,2.4,Iris-virginica +6.4,3.1,5.5,1.8,Iris-virginica +6.0,3.0,4.8,1.8,Iris-virginica +6.9,3.1,5.4,2.1,Iris-virginica +6.7,3.1,5.6,2.4,Iris-virginica +6.9,3.1,5.1,2.3,Iris-virginica +5.8,2.7,5.1,1.9,Iris-virginica +6.8,3.2,5.9,2.3,Iris-virginica +6.7,3.3,5.7,2.5,Iris-virginica +6.7,3.0,5.2,2.3,Iris-virginica +6.3,2.5,5.0,1.9,Iris-virginica +6.5,3.0,5.2,2.0,Iris-virginica +6.2,3.4,5.4,2.3,Iris-virginica +5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/tests/data/mindex_073.pickle b/pandas/tests/data/mindex_073.pickle new file mode 100644 index 0000000000000000000000000000000000000000..c99f51fa289ac53b0301d25dc3dec5cc61b670e0 GIT binary patch literal 670 zcmaJ;%TB{E5R8+Shj~Lw%i{xb>IoqZNSupBKu9@8C`pZ6i6-6PN>t*&B}V;fSf|-m zAc_weC$lp>-DOeX0Ej4t(`j>VBp# zFY45Xi_}LpwO1+(I;5bjD(LmkpbiENyH0&rjc$!P^q%t^+NDVpQ%n0}Yu_wYYZm&e zv|dd#mR?C*KnJ~K1YJ8#wJDB!s5S&Ai} 'a'] + expected = self.factor[np.asarray(self.factor) > 'a'] + self.assert_(result.equals(expected)) + + result = self.factor[self.factor >= 'b'] + expected = self.factor[np.asarray(self.factor) >= 'b'] + self.assert_(result.equals(expected)) + + result = self.factor[self.factor <= 'b'] + expected = self.factor[np.asarray(self.factor) <= 'b'] + self.assert_(result.equals(expected)) + + n = len(self.factor) + + other = self.factor[np.random.permutation(n)] + result = self.factor == other + expected = np.asarray(self.factor) == np.asarray(other) + self.assert_(np.array_equal(result, expected)) + + result = self.factor == 'd' + expected = np.repeat(False, len(self.factor)) + self.assert_(np.array_equal(result, expected)) + + def test_value_counts(self): + from pandas.tools.tile import cut + + arr = np.random.randn(4) + factor = cut(arr, 4) + + self.assert_(isinstance(factor, Categorical)) + + result = value_counts(factor) + expected = value_counts(np.asarray(factor)) + tm.assert_series_equal(result, expected) + + def test_na_flags_int_levels(self): + # #1457 + + levels = range(10) + labels = np.random.randint(0, 10, 20) + labels[::5] = -1 + + cat = Categorical(labels, levels) + repr(cat) + + self.assert_(np.array_equal(com.isnull(cat), labels == -1)) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + # 
'--with-coverage', '--cover-package=pandas.core'], + exit=False) + + diff --git a/pandas/tests/test_format.py b/pandas/tests/test_format.py new file mode 100644 index 00000000..0f02eba0 --- /dev/null +++ b/pandas/tests/test_format.py @@ -0,0 +1,779 @@ +try: + from StringIO import StringIO +except: + from io import StringIO + +import os +import sys +import unittest + +from numpy import nan +from numpy.random import randn +import numpy as np + +from pandas import DataFrame, Series, Index +import pandas.core.format as fmt +import pandas.util.testing as tm +import pandas + +_frame = DataFrame(tm.getSeriesData()) + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + +class TestDataFrameFormatting(unittest.TestCase): + + def setUp(self): + self.frame = _frame.copy() + + def test_repr_embedded_ndarray(self): + arr = np.empty(10, dtype=[('err', object)]) + for i in range(len(arr)): + arr['err'][i] = np.random.randn(i) + + df = DataFrame(arr) + repr(df['err']) + repr(df) + df.to_string() + + def test_eng_float_formatter(self): + self.frame.ix[5] = 0 + + fmt.set_eng_float_format() + result = repr(self.frame) + + fmt.set_eng_float_format(use_eng_prefix=True) + repr(self.frame) + + fmt.set_eng_float_format(accuracy=0) + repr(self.frame) + + fmt.reset_printoptions() + + def test_repr_tuples(self): + buf = StringIO() + + df = DataFrame({'tups' : zip(range(10), range(10))}) + repr(df) + df.to_string(col_space=10, buf=buf) + + def test_to_string_repr_unicode(self): + buf = StringIO() + + unicode_values = [u'\u03c3'] * 10 + unicode_values = np.array(unicode_values, dtype=object) + df = DataFrame({'unicode' : unicode_values}) + df.to_string(col_space=10, buf=buf) + + # it works! + repr(df) + + idx = Index(['abc', u'\u03c3a', 'aegdvg']) + ser = Series(np.random.randn(len(idx)), idx) + rs = repr(ser).split('\n') + line_len = len(rs[0]) + for line in rs[1:]: + try: + line = line.decode('utf-8') + except: + pass + self.assert_(len(line) == line_len) + + # it works even if sys.stdin in None + sys.stdin = None + repr(df) + sys.stdin = sys.__stdin__ + + def test_to_string_unicode_columns(self): + df = DataFrame({u'\u03c3' : np.arange(10.)}) + + buf = StringIO() + df.to_string(buf=buf) + buf.getvalue() + + buf = StringIO() + df.info(buf=buf) + buf.getvalue() + + result = self.frame.to_string(force_unicode=True) + self.assert_(isinstance(result, unicode)) + + def test_to_string_unicode_two(self): + dm = DataFrame({u'c/\u03c3': []}) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_unicode_three(self): + dm = DataFrame(['\xc2']) + buf = StringIO() + dm.to_string(buf) + + def test_to_string_with_formatters(self): + df = DataFrame({'int': [1, 2, 3], + 'float': [1.0, 2.0, 3.0], + 'object': [(1,2), True, False]}, + columns=['int', 'float', 'object']) + + result = df.to_string(formatters={'int': lambda x: '0x%x' % x, + 'float': lambda x: '[% 4.1f]' % x, + 'object': lambda x: '-%s-' % str(x)}) + self.assertEqual(result, (' int float object\n' + '0 0x1 [ 1.0] -(1, 2)-\n' + '1 0x2 [ 2.0] -True-\n' + '2 0x3 [ 3.0] -False-')) + + def test_to_string_with_formatters_unicode(self): + df = DataFrame({u'c/\u03c3':[1,2,3]}) + result = df.to_string(formatters={u'c/\u03c3': lambda x: '%s' % x}) + self.assertEqual(result, (u' c/\u03c3\n' + '0 1\n' + '1 2\n' + '2 3')) + + def test_to_string_buffer_all_unicode(self): + buf = StringIO() + + empty = DataFrame({u'c/\u03c3':Series()}) + nonempty = DataFrame({u'c/\u03c3':Series([1,2,3])}) + + print >>buf, empty + print >>buf, nonempty + + # this 
should work + buf.getvalue() + + def test_to_html_unicode(self): + # it works! + df = DataFrame({u'\u03c3' : np.arange(10.)}) + df.to_html() + df = DataFrame({'A' : [u'\u03c3']}) + df.to_html() + + def test_unicode_problem_decoding_as_ascii(self): + dm = DataFrame({u'c/\u03c3': Series({'test':np.NaN})}) + unicode(dm.to_string()) + + def test_string_repr_encoding(self): + pth = curpath() + filepath = os.path.join(pth, 'data', 'unicode_series.csv') + df = pandas.read_csv(filepath, header=None) + repr(df) + repr(df['X.2']) + + def test_repr_corner(self): + # representing infs poses no problems + df = DataFrame({'foo' : np.inf * np.empty(10)}) + foo = repr(df) + + def test_frame_info_encoding(self): + index = ['\'Til There Was You (1997)', + '\xc1 k\xf6ldum klaka (Cold Fever) (1994)'] + fmt.set_printoptions(max_rows=1) + df = DataFrame(columns=['a', 'b', 'c'], index=index) + repr(df) + repr(df.T) + fmt.set_printoptions(max_rows=200) + + def test_to_string(self): + from pandas import read_table + import re + + # big mixed + biggie = DataFrame({'A' : randn(200), + 'B' : tm.makeStringIndex(200)}, + index=range(200)) + + biggie['A'][:20] = nan + biggie['B'][:20] = nan + s = biggie.to_string() + + buf = StringIO() + retval = biggie.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue(), s) + + self.assert_(isinstance(s, basestring)) + + # print in right order + result = biggie.to_string(columns=['B', 'A'], col_space=17, + float_format='%.5f'.__mod__) + lines = result.split('\n') + header = lines[0].strip().split() + joined = '\n'.join([re.sub('\s+', ' ', x).strip() for x in lines[1:]]) + recons = read_table(StringIO(joined), names=header, sep=' ') + tm.assert_series_equal(recons['B'], biggie['B']) + self.assertEqual(recons['A'].count(), biggie['A'].count()) + self.assert_((np.abs(recons['A'].dropna() - + biggie['A'].dropna()) < 0.1).all()) + + # expected = ['B', 'A'] + # self.assertEqual(header, expected) + + result = biggie.to_string(columns=['A'], col_space=17) + header = result.split('\n')[0].strip().split() + expected = ['A'] + self.assertEqual(header, expected) + + biggie.to_string(columns=['B', 'A'], + formatters={'A' : lambda x: '%.1f' % x}) + + biggie.to_string(columns=['B', 'A'], float_format=str) + biggie.to_string(columns=['B', 'A'], col_space=12, + float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_string() + + def test_to_string_no_header(self): + df = DataFrame({'x' : [1, 2, 3], + 'y' : [4, 5, 6]}) + + df_s = df.to_string(header=False) + expected = "0 1 4\n1 2 5\n2 3 6" + + assert(df_s == expected) + + def test_to_string_no_index(self): + df = DataFrame({'x' : [1, 2, 3], + 'y' : [4, 5, 6]}) + + df_s = df.to_string(index=False) + expected = " x y\n 1 4\n 2 5\n 3 6" + + assert(df_s == expected) + + def test_to_string_float_formatting(self): + fmt.reset_printoptions() + fmt.set_printoptions(precision=6, column_space=12, + notebook_repr_html=False) + + df = DataFrame({'x' : [0, 0.25, 3456.000, 12e+45, 1.64e+6, + 1.7e+8, 1.253456, np.pi, -1e6]}) + + df_s = df.to_string() + + # Python 2.5 just wants me to be sad. 
And debian 32-bit + #sys.version_info[0] == 2 and sys.version_info[1] < 6: + if '%.4g' % 1.7e8 == '1.7e+008': + expected = (' x\n0 0.00000e+000\n1 2.50000e-001\n' + '2 3.45600e+003\n3 1.20000e+046\n4 1.64000e+006\n' + '5 1.70000e+008\n6 1.25346e+000\n7 3.14159e+000\n' + '8 -1.00000e+006') + else: + expected = (' x\n0 0.00000e+00\n1 2.50000e-01\n' + '2 3.45600e+03\n3 1.20000e+46\n4 1.64000e+06\n' + '5 1.70000e+08\n6 1.25346e+00\n7 3.14159e+00\n' + '8 -1.00000e+06') + assert(df_s == expected) + + df = DataFrame({'x' : [3234, 0.253]}) + df_s = df.to_string() + + expected = (' x\n' + '0 3234.000\n' + '1 0.253') + assert(df_s == expected) + + fmt.reset_printoptions() + self.assertEqual(fmt.print_config.precision, 7) + + df = DataFrame({'x': [1e9, 0.2512]}) + df_s = df.to_string() + # Python 2.5 just wants me to be sad. And debian 32-bit + #sys.version_info[0] == 2 and sys.version_info[1] < 6: + if '%.4g' % 1.7e8 == '1.7e+008': + expected = (' x\n' + '0 1.000000e+009\n' + '1 2.512000e-001') + else: + expected = (' x\n' + '0 1.000000e+09\n' + '1 2.512000e-01') + assert(df_s == expected) + + def test_to_string_float_index(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(range(5), index=index) + + result = df.to_string() + expected = (' 0\n' + '1.5 0\n' + '2 1\n' + '3 2\n' + '4 3\n' + '5 4') + self.assertEqual(result, expected) + + def test_to_string_ascii_error(self): + data = [('0 ', + u' .gitignore ', + u' 5 ', + ' \xe2\x80\xa2\xe2\x80\xa2\xe2\x80' + '\xa2\xe2\x80\xa2\xe2\x80\xa2')] + df = DataFrame(data) + + # it works! + repr(df) + + def test_to_string_int_formatting(self): + df = DataFrame({'x' : [-15, 20, 25, -35]}) + self.assert_(issubclass(df['x'].dtype.type, np.integer)) + + output = df.to_string() + self.assert_(isinstance(output, str)) + expected = (' x\n' + '0 -15\n' + '1 20\n' + '2 25\n' + '3 -35') + self.assertEqual(output, expected) + + def test_to_string_left_justify_cols(self): + fmt.reset_printoptions() + df = DataFrame({'x' : [3234, 0.253]}) + df_s = df.to_string(justify='left') + expected = (' x \n' + '0 3234.000\n' + '1 0.253') + assert(df_s == expected) + + def test_to_string_format_na(self): + fmt.reset_printoptions() + df = DataFrame({'A' : [np.nan, -1, -2.1234, 3, 4], + 'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + result = df.to_string() + + expected = (' A B\n' + '0 NaN NaN\n' + '1 -1.0000 foo\n' + '2 -2.1234 foooo\n' + '3 3.0000 fooooo\n' + '4 4.0000 bar') + self.assertEqual(result, expected) + + df = DataFrame({'A' : [np.nan, -1., -2., 3., 4.], + 'B' : [np.nan, 'foo', 'foooo', 'fooooo', 'bar']}) + result = df.to_string() + + expected = (' A B\n' + '0 NaN NaN\n' + '1 -1 foo\n' + '2 -2 foooo\n' + '3 3 fooooo\n' + '4 4 bar') + self.assertEqual(result, expected) + + def test_to_html(self): + # big mixed + biggie = DataFrame({'A' : randn(200), + 'B' : tm.makeStringIndex(200)}, + index=range(200)) + + biggie['A'][:20] = nan + biggie['B'][:20] = nan + s = biggie.to_html() + + buf = StringIO() + retval = biggie.to_html(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue(), s) + + self.assert_(isinstance(s, basestring)) + + biggie.to_html(columns=['B', 'A'], col_space=17) + biggie.to_html(columns=['B', 'A'], + formatters={'A' : lambda x: '%.1f' % x}) + + biggie.to_html(columns=['B', 'A'], float_format=str) + biggie.to_html(columns=['B', 'A'], col_space=12, + float_format=str) + + frame = DataFrame(index=np.arange(200)) + frame.to_html() + + def test_to_html_with_no_bold(self): + x = DataFrame({'x': randn(5)}) + ashtml = 
x.to_html(bold_rows=False) + assert('' not in ashtml) + + def test_to_html_columns_arg(self): + result = self.frame.to_html(columns=['A']) + self.assert_('B' not in result) + + def test_to_html_multiindex(self): + columns = pandas.MultiIndex.from_tuples(zip(range(4), + np.mod(range(4), 2)), + names=['CL0', 'CL1']) + df = pandas.DataFrame([list('abcd'), list('efgh')], columns=columns) + result = df.to_html() + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
' + ' CL0 CL1
' + ' 0 0
' + ' 1 1
' + ' 2 0
' + ' 3 1
0 a b c d
1 e f g h
') + self.assertEqual(result, expected) + + columns = pandas.MultiIndex.from_tuples(zip(range(4), + np.mod(range(4), 2))) + df = pandas.DataFrame([list('abcd'), list('efgh')], columns=columns) + result = df.to_html() + expected = ('\n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + ' \n' + '
' + ' 0 0
' + ' 1 1
' + ' 2 0
' + ' 3 1
0 a b c d
1 e f g h
') + self.assertEqual(result, expected) + + def test_repr_html(self): + self.frame._repr_html_() + + fmt.set_printoptions(max_rows=1, max_columns=1) + self.frame._repr_html_() + + fmt.set_printoptions(notebook_repr_html=False) + self.frame._repr_html_() + + fmt.reset_printoptions() + +class TestSeriesFormatting(unittest.TestCase): + + def setUp(self): + self.ts = tm.makeTimeSeries() + + def test_repr_unicode(self): + s = Series([u'\u03c3'] * 10) + repr(s) + + def test_to_string(self): + buf = StringIO() + + s = self.ts.to_string() + + retval = self.ts.to_string(buf=buf) + self.assert_(retval is None) + self.assertEqual(buf.getvalue().strip(), s) + + # pass float_format + format = '%.4f'.__mod__ + result = self.ts.to_string(float_format=format) + result = [x.split()[1] for x in result.split('\n')] + expected = [format(x) for x in self.ts] + self.assertEqual(result, expected) + + # empty string + result = self.ts[:0].to_string() + self.assertEqual(result, '') + + result = self.ts[:0].to_string(length=0) + self.assertEqual(result, '') + + # name and length + cp = self.ts.copy() + cp.name = 'foo' + result = cp.to_string(length=True, name=True) + last_line = result.split('\n')[-1].strip() + self.assertEqual(last_line, "Freq: B, Name: foo, Length: %d" % len(cp)) + + def test_to_string_mixed(self): + s = Series(['foo', np.nan, -1.23, 4.56]) + result = s.to_string() + expected = ('0 foo\n' + '1 NaN\n' + '2 -1.23\n' + '3 4.56') + self.assertEqual(result, expected) + + # but don't count NAs as floats + s = Series(['foo', np.nan, 'bar', 'baz']) + result = s.to_string() + expected = ('0 foo\n' + '1 NaN\n' + '2 bar\n' + '3 baz') + self.assertEqual(result, expected) + + s = Series(['foo', 5, 'bar', 'baz']) + result = s.to_string() + expected = ('0 foo\n' + '1 5\n' + '2 bar\n' + '3 baz') + self.assertEqual(result, expected) + + def test_to_string_float_na_spacing(self): + s = Series([0., 1.5678, 2., -3., 4.]) + s[::2] = np.nan + + result = s.to_string() + expected = ('0 NaN\n' + '1 1.5678\n' + '2 NaN\n' + '3 -3.0000\n' + '4 NaN') + self.assertEqual(result, expected) + +class TestEngFormatter(unittest.TestCase): + + def test_eng_float_formatter(self): + df = DataFrame({'A' : [1.41, 141., 14100, 1410000.]}) + + fmt.set_eng_float_format() + result = df.to_string() + expected = (' A\n' + '0 1.410E+00\n' + '1 141.000E+00\n' + '2 14.100E+03\n' + '3 1.410E+06') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(use_eng_prefix=True) + result = df.to_string() + expected = (' A\n' + '0 1.410\n' + '1 141.000\n' + '2 14.100k\n' + '3 1.410M') + self.assertEqual(result, expected) + + fmt.set_eng_float_format(accuracy=0) + result = df.to_string() + expected = (' A\n' + '0 1E+00\n' + '1 141E+00\n' + '2 14E+03\n' + '3 1E+06') + self.assertEqual(result, expected) + + fmt.reset_printoptions() + + def compare(self, formatter, input, output): + formatted_input = formatter(input) + msg = ("formatting of %s results in '%s', expected '%s'" + % (str(input), formatted_input, output)) + self.assertEqual(formatted_input, output, msg) + + def compare_all(self, formatter, in_out): + """ + Parameters: + ----------- + formatter: EngFormatter under test + in_out: list of tuples. Each tuple = (number, expected_formatting) + + It is tested if 'formatter(number) == expected_formatting'. + *number* should be >= 0 because formatter(-number) == fmt is also + tested. 
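+        (For instance, if ``formatter(5555.55)`` yields ``' 5.556k'``, then
+        ``formatter(-5555.55)`` must yield ``'-5.556k'``.)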
*fmt* is derived from *expected_formatting* + """ + for input, output in in_out: + self.compare(formatter, input, output) + self.compare(formatter, -input, "-" + output[1:]) + + def test_exponents_with_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + f = np.sqrt(2) + in_out = [(f * 10 ** -24, " 1.414y"), + (f * 10 ** -23, " 14.142y"), + (f * 10 ** -22, " 141.421y"), + (f * 10 ** -21, " 1.414z"), + (f * 10 ** -20, " 14.142z"), + (f * 10 ** -19, " 141.421z"), + (f * 10 ** -18, " 1.414a"), + (f * 10 ** -17, " 14.142a"), + (f * 10 ** -16, " 141.421a"), + (f * 10 ** -15, " 1.414f"), + (f * 10 ** -14, " 14.142f"), + (f * 10 ** -13, " 141.421f"), + (f * 10 ** -12, " 1.414p"), + (f * 10 ** -11, " 14.142p"), + (f * 10 ** -10, " 141.421p"), + (f * 10 ** -9, " 1.414n"), + (f * 10 ** -8, " 14.142n"), + (f * 10 ** -7, " 141.421n"), + (f * 10 ** -6, " 1.414u"), + (f * 10 ** -5, " 14.142u"), + (f * 10 ** -4, " 141.421u"), + (f * 10 ** -3, " 1.414m"), + (f * 10 ** -2, " 14.142m"), + (f * 10 ** -1, " 141.421m"), + (f * 10 ** 0, " 1.414"), + (f * 10 ** 1, " 14.142"), + (f * 10 ** 2, " 141.421"), + (f * 10 ** 3, " 1.414k"), + (f * 10 ** 4, " 14.142k"), + (f * 10 ** 5, " 141.421k"), + (f * 10 ** 6, " 1.414M"), + (f * 10 ** 7, " 14.142M"), + (f * 10 ** 8, " 141.421M"), + (f * 10 ** 9, " 1.414G"), + (f * 10 ** 10, " 14.142G"), + (f * 10 ** 11, " 141.421G"), + (f * 10 ** 12, " 1.414T"), + (f * 10 ** 13, " 14.142T"), + (f * 10 ** 14, " 141.421T"), + (f * 10 ** 15, " 1.414P"), + (f * 10 ** 16, " 14.142P"), + (f * 10 ** 17, " 141.421P"), + (f * 10 ** 18, " 1.414E"), + (f * 10 ** 19, " 14.142E"), + (f * 10 ** 20, " 141.421E"), + (f * 10 ** 21, " 1.414Z"), + (f * 10 ** 22, " 14.142Z"), + (f * 10 ** 23, " 141.421Z"), + (f * 10 ** 24, " 1.414Y"), + (f * 10 ** 25, " 14.142Y"), + (f * 10 ** 26, " 141.421Y")] + self.compare_all(formatter, in_out) + + def test_exponents_without_eng_prefix(self): + formatter = fmt.EngFormatter(accuracy=4, use_eng_prefix=False) + f = np.pi + in_out = [(f * 10 ** -24, " 3.1416E-24"), + (f * 10 ** -23, " 31.4159E-24"), + (f * 10 ** -22, " 314.1593E-24"), + (f * 10 ** -21, " 3.1416E-21"), + (f * 10 ** -20, " 31.4159E-21"), + (f * 10 ** -19, " 314.1593E-21"), + (f * 10 ** -18, " 3.1416E-18"), + (f * 10 ** -17, " 31.4159E-18"), + (f * 10 ** -16, " 314.1593E-18"), + (f * 10 ** -15, " 3.1416E-15"), + (f * 10 ** -14, " 31.4159E-15"), + (f * 10 ** -13, " 314.1593E-15"), + (f * 10 ** -12, " 3.1416E-12"), + (f * 10 ** -11, " 31.4159E-12"), + (f * 10 ** -10, " 314.1593E-12"), + (f * 10 ** -9, " 3.1416E-09"), + (f * 10 ** -8, " 31.4159E-09"), + (f * 10 ** -7, " 314.1593E-09"), + (f * 10 ** -6, " 3.1416E-06"), + (f * 10 ** -5, " 31.4159E-06"), + (f * 10 ** -4, " 314.1593E-06"), + (f * 10 ** -3, " 3.1416E-03"), + (f * 10 ** -2, " 31.4159E-03"), + (f * 10 ** -1, " 314.1593E-03"), + (f * 10 ** 0, " 3.1416E+00"), + (f * 10 ** 1, " 31.4159E+00"), + (f * 10 ** 2, " 314.1593E+00"), + (f * 10 ** 3, " 3.1416E+03"), + (f * 10 ** 4, " 31.4159E+03"), + (f * 10 ** 5, " 314.1593E+03"), + (f * 10 ** 6, " 3.1416E+06"), + (f * 10 ** 7, " 31.4159E+06"), + (f * 10 ** 8, " 314.1593E+06"), + (f * 10 ** 9, " 3.1416E+09"), + (f * 10 ** 10, " 31.4159E+09"), + (f * 10 ** 11, " 314.1593E+09"), + (f * 10 ** 12, " 3.1416E+12"), + (f * 10 ** 13, " 31.4159E+12"), + (f * 10 ** 14, " 314.1593E+12"), + (f * 10 ** 15, " 3.1416E+15"), + (f * 10 ** 16, " 31.4159E+15"), + (f * 10 ** 17, " 314.1593E+15"), + (f * 10 ** 18, " 3.1416E+18"), + (f * 10 ** 19, " 31.4159E+18"), + (f * 10 ** 20, " 314.1593E+18"), + 
(f * 10 ** 21, " 3.1416E+21"), + (f * 10 ** 22, " 31.4159E+21"), + (f * 10 ** 23, " 314.1593E+21"), + (f * 10 ** 24, " 3.1416E+24"), + (f * 10 ** 25, " 31.4159E+24"), + (f * 10 ** 26, " 314.1593E+24")] + self.compare_all(formatter, in_out) + + def test_rounding(self): + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + in_out = [(5.55555, ' 5.556'), + (55.5555, ' 55.556'), + (555.555, ' 555.555'), + (5555.55, ' 5.556k'), + (55555.5, ' 55.556k'), + (555555, ' 555.555k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=1, use_eng_prefix=True) + in_out = [(5.55555, ' 5.6'), + (55.5555, ' 55.6'), + (555.555, ' 555.6'), + (5555.55, ' 5.6k'), + (55555.5, ' 55.6k'), + (555555, ' 555.6k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=0, use_eng_prefix=True) + in_out = [(5.55555, ' 6'), + (55.5555, ' 56'), + (555.555, ' 556'), + (5555.55, ' 6k'), + (55555.5, ' 56k'), + (555555, ' 556k')] + self.compare_all(formatter, in_out) + + formatter = fmt.EngFormatter(accuracy=3, use_eng_prefix=True) + result = formatter(0) + self.assertEqual(result, u' 0.000') + + +class TestFloatArrayFormatter(unittest.TestCase): + + def test_misc(self): + obj = fmt.FloatArrayFormatter(np.array([], dtype=np.float64)) + result = obj.get_result() + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py new file mode 100644 index 00000000..3e5977a1 --- /dev/null +++ b/pandas/tests/test_frame.py @@ -0,0 +1,6515 @@ +# pylint: disable-msg=W0612,E1101 +from copy import deepcopy +from datetime import datetime, timedelta +from StringIO import StringIO +import cPickle as pickle +import operator +import os +import unittest + +import nose + +from numpy import random, nan +from numpy.random import randn +import numpy as np +import numpy.ma as ma +from numpy.testing import assert_array_equal + +import pandas as pan +import pandas.core.nanops as nanops +import pandas.core.common as com +import pandas.core.format as fmt +import pandas.core.datetools as datetools +from pandas.core.api import (DataFrame, Index, Series, notnull, isnull, + MultiIndex, DatetimeIndex) +from pandas.io.parsers import (ExcelFile, ExcelWriter, read_csv) + +from pandas.util.testing import (assert_almost_equal, + assert_series_equal, + assert_frame_equal) + +import pandas.util.testing as tm +import pandas.lib as lib + +def _skip_if_no_scipy(): + try: + import scipy.stats + except ImportError: + raise nose.SkipTest + +#------------------------------------------------------------------------------- +# DataFrame test cases + +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + +class CheckIndexing(object): + + def test_getitem(self): + # slicing + + sl = self.frame[:20] + self.assertEqual(20, len(sl.index)) + + # column access + + for _, series in sl.iteritems(): + self.assertEqual(20, len(series.index)) + self.assert_(tm.equalContents(series.index, sl.index)) + + for key, _ in self.frame._series.iteritems(): + self.assert_(self.frame[key] is not None) + + self.assert_('random' not in self.frame) + self.assertRaises(Exception, self.frame.__getitem__, 'random') + + def test_get(self): + b = self.frame.get('B') + assert_series_equal(b, self.frame['B']) + + self.assert_(self.frame.get('foo') is None) + assert_series_equal(self.frame.get('foo', self.frame['B']), + self.frame['B']) + + def test_getitem_iterator(self): + idx = iter(['A', 'B', 'C']) + result 
= self.frame.ix[:, idx] + expected = self.frame.ix[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + + def test_getitem_list(self): + self.frame.columns.name = 'foo' + + result = self.frame[['B', 'A']] + result2 = self.frame[Index(['B', 'A'])] + + expected = self.frame.ix[:, ['B', 'A']] + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + self.assertEqual(result.columns.name, 'foo') + + self.assertRaises(Exception, self.frame.__getitem__, + ['B', 'A', 'foo']) + self.assertRaises(Exception, self.frame.__getitem__, + Index(['B', 'A', 'foo'])) + + # tuples + df = DataFrame(randn(8, 3), + columns=Index([('foo', 'bar'), ('baz', 'qux'), + ('peek', 'aboo')], name='sth')) + + result = df[[('foo', 'bar'), ('baz', 'qux')]] + expected = df.ix[:, :2] + assert_frame_equal(result, expected) + self.assertEqual(result.columns.name, 'sth') + + def test_setitem_list(self): + self.frame['E'] = 'foo' + data = self.frame[['A', 'B']] + self.frame[['B', 'A']] = data + + assert_series_equal(self.frame['B'], data['A']) + assert_series_equal(self.frame['A'], data['B']) + + def test_setitem_list_not_dataframe(self): + data = np.random.randn(len(self.frame), 2) + self.frame[['A', 'B']] = data + assert_almost_equal(self.frame[['A', 'B']].values, data) + + def test_setitem_list_of_tuples(self): + tuples = zip(self.frame['A'], self.frame['B']) + self.frame['tuples'] = tuples + + result = self.frame['tuples'] + expected = Series(tuples, index=self.frame.index) + assert_series_equal(result, expected) + + def test_getitem_boolean(self): + # boolean indexing + d = self.tsframe.index[10] + indexer = self.tsframe.index > d + indexer_obj = indexer.astype(object) + + subindex = self.tsframe.index[indexer] + subframe = self.tsframe[indexer] + + self.assert_(np.array_equal(subindex, subframe.index)) + self.assertRaises(Exception, self.tsframe.__getitem__, indexer[:-1]) + + subframe_obj = self.tsframe[indexer_obj] + assert_frame_equal(subframe_obj, subframe) + + self.assertRaises(ValueError, self.tsframe.__getitem__, self.tsframe) + + + def test_getitem_boolean_list(self): + df = DataFrame(np.arange(12).reshape(3,4)) + def _checkit(lst): + result = df[lst] + expected = df.ix[df.index[lst]] + assert_frame_equal(result, expected) + + _checkit([True, False, True]) + _checkit([True, True, True]) + _checkit([False, False, False]) + + def test_getitem_boolean_iadd(self): + arr = randn(5, 5) + + df = DataFrame(arr.copy()) + df[df < 0] += 1 + + arr[arr < 0] += 1 + + assert_almost_equal(df.values, arr) + + def test_getattr(self): + tm.assert_series_equal(self.frame.A, self.frame['A']) + self.assertRaises(AttributeError, getattr, self.frame, + 'NONEXISTENT_NAME') + + def test_setattr_column(self): + df = DataFrame({'foobar' : 1}, index=range(10)) + + df.foobar = 5 + self.assert_((df.foobar == 5).all()) + + def test_setitem(self): + # not sure what else to do here + series = self.frame['A'][::2] + self.frame['col5'] = series + self.assert_('col5' in self.frame) + tm.assert_dict_equal(series, self.frame['col5'], + compare_keys=False) + + series = self.frame['A'] + self.frame['col6'] = series + tm.assert_dict_equal(series, self.frame['col6'], + compare_keys=False) + + self.assertRaises(Exception, self.frame.__setitem__, + randn(len(self.frame) + 1)) + + # set ndarray + arr = randn(len(self.frame)) + self.frame['col9'] = arr + self.assert_((self.frame['col9'] == arr).all()) + + self.frame['col7'] = 5 + assert((self.frame['col7'] == 5).all()) + + self.frame['col0'] = 3.14 + assert((self.frame['col0'] == 
3.14).all()) + + self.frame['col8'] = 'foo' + assert((self.frame['col8'] == 'foo').all()) + + smaller = self.frame[:2] + smaller['col10'] = ['1', '2'] + self.assertEqual(smaller['col10'].dtype, np.object_) + self.assert_((smaller['col10'] == ['1', '2']).all()) + + def test_setitem_tuple(self): + self.frame['A', 'B'] = self.frame['A'] + assert_series_equal(self.frame['A', 'B'], self.frame['A']) + + def test_setitem_always_copy(self): + s = self.frame['A'].copy() + self.frame['E'] = s + + self.frame['E'][5:10] = nan + self.assert_(notnull(s[5:10]).all()) + + def test_setitem_boolean(self): + df = self.frame.copy() + values = self.frame.values + + df[df > 0] = 5 + values[values > 0] = 5 + assert_almost_equal(df.values, values) + + df[df == 5] = 0 + values[values == 5] = 0 + assert_almost_equal(df.values, values) + + self.assertRaises(Exception, df.__setitem__, df[:-1] > 0, 2) + self.assertRaises(Exception, df.__setitem__, df * 0, 2) + + # index with DataFrame + mask = df > np.abs(df) + expected = df.copy() + df[df > np.abs(df)] = nan + expected.values[mask.values] = nan + assert_frame_equal(df, expected) + + # set from DataFrame + expected = df.copy() + df[df > np.abs(df)] = df * 2 + np.putmask(expected.values, mask.values, df.values * 2) + assert_frame_equal(df, expected) + + def test_setitem_cast(self): + self.frame['D'] = self.frame['D'].astype('i8') + self.assert_(self.frame['D'].dtype == np.int64) + + # #669, should not cast? + self.frame['B'] = 0 + self.assert_(self.frame['B'].dtype == np.float64) + + # cast if pass array of course + self.frame['B'] = np.arange(len(self.frame)) + self.assert_(issubclass(self.frame['B'].dtype.type, np.integer)) + + self.frame['foo'] = 'bar' + self.frame['foo'] = 0 + self.assert_(self.frame['foo'].dtype == np.int64) + + self.frame['foo'] = 'bar' + self.frame['foo'] = 2.5 + self.assert_(self.frame['foo'].dtype == np.float64) + + self.frame['something'] = 0 + self.assert_(self.frame['something'].dtype == np.int64) + self.frame['something'] = 2 + self.assert_(self.frame['something'].dtype == np.int64) + self.frame['something'] = 2.5 + self.assert_(self.frame['something'].dtype == np.float64) + + def test_setitem_boolean_column(self): + expected = self.frame.copy() + mask = self.frame['A'] > 0 + + self.frame.ix[mask, 'B'] = 0 + expected.values[mask, 1] = 0 + + assert_frame_equal(self.frame, expected) + + def test_setitem_corner(self): + # corner case + df = DataFrame({'B' : [1., 2., 3.], + 'C' : ['a', 'b', 'c']}, + index=np.arange(3)) + del df['B'] + df['B'] = [1., 2., 3.] + self.assert_('B' in df) + self.assertEqual(len(df.columns), 2) + + df['A'] = 'beginning' + df['E'] = 'foo' + df['D'] = 'bar' + df[datetime.now()] = 'date' + df[datetime.now()] = 5. 
+ + # what to do when empty frame with index + dm = DataFrame(index=self.frame.index) + dm['A'] = 'foo' + dm['B'] = 'bar' + self.assertEqual(len(dm.columns), 2) + self.assertEqual(dm.values.dtype, np.object_) + + dm['C'] = 1 + self.assertEqual(dm['C'].dtype, np.int64) + + # set existing column + dm['A'] = 'bar' + self.assertEqual('bar', dm['A'][0]) + + dm = DataFrame(index=np.arange(3)) + dm['A'] = 1 + dm['foo'] = 'bar' + del dm['foo'] + dm['foo'] = 'bar' + self.assertEqual(dm['foo'].dtype, np.object_) + + dm['coercable'] = ['1', '2', '3'] + self.assertEqual(dm['coercable'].dtype, np.object_) + + def test_setitem_corner2(self): + data = {"title" : ['foobar','bar','foobar'] + ['foobar'] * 17 , + "cruft" : np.random.random(20)} + + df = DataFrame(data) + ix = df[df['title'] == 'bar'].index + + df.ix[ix, ['title']] = 'foobar' + df.ix[ix, ['cruft']] = 0 + + assert( df.ix[1, 'title'] == 'foobar' ) + assert( df.ix[1, 'cruft'] == 0 ) + + def test_setitem_ambig(self): + # difficulties with mixed-type data + from decimal import Decimal + + # created as float type + dm = DataFrame(index=range(3), columns=range(3)) + + coercable_series = Series([Decimal(1) for _ in range(3)], + index=range(3)) + uncoercable_series = Series(['foo', 'bzr', 'baz'], index=range(3)) + + dm[0] = np.ones(3) + self.assertEqual(len(dm.columns), 3) + # self.assert_(dm.objects is None) + + dm[1] = coercable_series + self.assertEqual(len(dm.columns), 3) + # self.assert_(dm.objects is None) + + dm[2] = uncoercable_series + self.assertEqual(len(dm.columns), 3) + # self.assert_(dm.objects is not None) + self.assert_(dm[2].dtype == np.object_) + + def test_setitem_clear_caches(self): + # GH #304 + df = DataFrame({'x': [1.1, 2.1, 3.1, 4.1], 'y': [5.1, 6.1, 7.1, 8.1]}, + index=[0,1,2,3]) + df.insert(2, 'z', np.nan) + + # cache it + foo = df['z'] + + df.ix[2:, 'z'] = 42 + + expected = Series([np.nan, np.nan, 42, 42], index=df.index) + self.assert_(df['z'] is not foo) + assert_series_equal(df['z'], expected) + + def test_setitem_None(self): + # GH #766 + self.frame[None] = self.frame['A'] + assert_series_equal(self.frame[None], self.frame['A']) + repr(self.frame) + + def test_delitem_corner(self): + f = self.frame.copy() + del f['D'] + self.assertEqual(len(f.columns), 3) + self.assertRaises(KeyError, f.__delitem__, 'D') + del f['B'] + self.assertEqual(len(f.columns), 2) + + def test_getitem_fancy_2d(self): + f = self.frame + ix = f.ix + + assert_frame_equal(ix[:, ['B', 'A']], f.reindex(columns=['B', 'A'])) + + subidx = self.frame.index[[5, 4, 1]] + assert_frame_equal(ix[subidx, ['B', 'A']], + f.reindex(index=subidx, columns=['B', 'A'])) + + # slicing rows, etc. + assert_frame_equal(ix[5:10], f[5:10]) + assert_frame_equal(ix[5:10, :], f[5:10]) + assert_frame_equal(ix[:5, ['A', 'B']], + f.reindex(index=f.index[:5], columns=['A', 'B'])) + + # slice rows with labels, inclusive! 
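+        # (label-based slices include both endpoints, so f.index[5]:f.index[10]
+        # selects rows 5 through 10, matching positional ix[5:11])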
+ expected = ix[5:11] + result = ix[f.index[5]:f.index[10]] + assert_frame_equal(expected, result) + + # slice columns + assert_frame_equal(ix[:, :2], f.reindex(columns=['A', 'B'])) + + # get view + exp = f.copy() + ix[5:10].values[:] = 5 + exp.values[5:10] = 5 + assert_frame_equal(f, exp) + + self.assertRaises(ValueError, ix.__getitem__, f > 0.5) + + def test_slice_floats(self): + index = [52195.504153, 52196.303147, 52198.369883] + df = DataFrame(np.random.rand(3, 2), index=index) + + s1 = df.ix[52195.1:52196.5] + self.assertEquals(len(s1), 2) + + s1 = df.ix[52195.1:52196.6] + self.assertEquals(len(s1), 2) + + s1 = df.ix[52195.1:52198.9] + self.assertEquals(len(s1), 3) + + def test_getitem_fancy_slice_integers_step(self): + df = DataFrame(np.random.randn(10, 5)) + + # this is OK + result = df.ix[:8:2] + df.ix[:8:2] = np.nan + self.assert_(isnull(df.ix[:8:2]).values.all()) + + def test_getitem_setitem_integer_slice_keyerrors(self): + df = DataFrame(np.random.randn(10, 5), index=range(0, 20, 2)) + + # this is OK + cp = df.copy() + cp.ix[4:10] = 0 + self.assert_((cp.ix[4:10] == 0).values.all()) + + # so is this + cp = df.copy() + cp.ix[3:11] = 0 + self.assert_((cp.ix[3:11] == 0).values.all()) + + result = df.ix[4:10] + result2 = df.ix[3:11] + expected = df.reindex([4, 6, 8, 10]) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + # non-monotonic, raise KeyError + df2 = df[::-1] + self.assertRaises(KeyError, df2.ix.__getitem__, slice(3, 11)) + self.assertRaises(KeyError, df2.ix.__setitem__, slice(3, 11), 0) + + def test_setitem_fancy_2d(self): + f = self.frame + ix = f.ix + + # case 1 + frame = self.frame.copy() + expected = frame.copy() + frame.ix[:, ['B', 'A']] = 1 + expected['B'] = 1. + expected['A'] = 1. + assert_frame_equal(frame, expected) + + # case 2 + frame = self.frame.copy() + frame2 = self.frame.copy() + + expected = frame.copy() + + subidx = self.frame.index[[5, 4, 1]] + values = randn(3, 2) + + frame.ix[subidx, ['B', 'A']] = values + frame2.ix[[5, 4, 1], ['B', 'A']] = values + + expected['B'].ix[subidx] = values[:, 0] + expected['A'].ix[subidx] = values[:, 1] + + assert_frame_equal(frame, expected) + assert_frame_equal(frame2, expected) + + # case 3: slicing rows, etc. + frame = self.frame.copy() + + expected1 = self.frame.copy() + frame.ix[5:10] = 1. + expected1.values[5:10] = 1. + assert_frame_equal(frame, expected1) + + expected2 = self.frame.copy() + arr = randn(5, len(frame.columns)) + frame.ix[5:10] = arr + expected2.values[5:10] = arr + assert_frame_equal(frame, expected2) + + # case 4 + frame = self.frame.copy() + frame.ix[5:10, :] = 1. + assert_frame_equal(frame, expected1) + frame.ix[5:10, :] = arr + assert_frame_equal(frame, expected2) + + # case 5 + frame = self.frame.copy() + frame2 = self.frame.copy() + + expected = self.frame.copy() + values = randn(5, 2) + + frame.ix[:5, ['A', 'B']] = values + expected['A'][:5] = values[:, 0] + expected['B'][:5] = values[:, 1] + assert_frame_equal(frame, expected) + + frame2.ix[:5, [0, 1]] = values + assert_frame_equal(frame2, expected) + + # case 6: slice rows with labels, inclusive! + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[frame.index[5]:frame.index[10]] = 5. + expected.values[5:11] = 5 + assert_frame_equal(frame, expected) + + # case 7: slice columns + frame = self.frame.copy() + frame2 = self.frame.copy() + expected = self.frame.copy() + + # slice indices + frame.ix[:, 1:3] = 4. + expected.values[:, 1:3] = 4. 
+ assert_frame_equal(frame, expected) + + # slice with labels + frame.ix[:, 'B':'C'] = 4. + assert_frame_equal(frame, expected) + + # new corner case of boolean slicing / setting + frame = DataFrame(zip([2,3,9,6,7], [np.nan]*5), + columns=['a','b']) + lst = [100] + lst.extend([np.nan]*4) + expected = DataFrame(zip([100,3,9,6,7], lst), columns=['a','b']) + frame[frame['a'] == 2] = 100 + assert_frame_equal(frame, expected) + + + def test_fancy_getitem_slice_mixed(self): + sliced = self.mixed_frame.ix[:, -3:] + self.assert_(sliced['D'].dtype == np.float64) + + # get view with single block + sliced = self.frame.ix[:, -3:] + sliced['C'] = 4. + self.assert_((self.frame['C'] == 4).all()) + + def test_fancy_setitem_int_labels(self): + # integer index defers to label-based indexing + + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + tmp = df.copy() + exp = df.copy() + tmp.ix[[0, 2, 4]] = 5 + exp.values[:3] = 5 + assert_frame_equal(tmp, exp) + + tmp = df.copy() + exp = df.copy() + tmp.ix[6] = 5 + exp.values[3] = 5 + assert_frame_equal(tmp, exp) + + tmp = df.copy() + exp = df.copy() + tmp.ix[:, 2] = 5 + exp.values[:, 2] = 5 + assert_frame_equal(tmp, exp) + + def test_fancy_getitem_int_labels(self): + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + result = df.ix[[4, 2, 0], [2, 0]] + expected = df.reindex(index=[4, 2, 0], columns=[2, 0]) + assert_frame_equal(result, expected) + + result = df.ix[[4, 2, 0]] + expected = df.reindex(index=[4, 2, 0]) + assert_frame_equal(result, expected) + + result = df.ix[4] + expected = df.xs(4) + assert_series_equal(result, expected) + + result = df.ix[:, 3] + expected = df[3] + assert_series_equal(result, expected) + + def test_fancy_index_int_labels_exceptions(self): + df = DataFrame(np.random.randn(10, 5), index=np.arange(0, 20, 2)) + + # labels that aren't contained + self.assertRaises(KeyError, df.ix.__setitem__, + ([0, 1, 2], [2, 3, 4]), 5) + + # try to set indices not contained in frame + self.assertRaises(KeyError, + self.frame.ix.__setitem__, + ['foo', 'bar', 'baz'], 1) + self.assertRaises(KeyError, + self.frame.ix.__setitem__, + (slice(None, None), ['E']), 1) + self.assertRaises(KeyError, + self.frame.ix.__setitem__, + (slice(None, None), 'E'), 1) + + def test_setitem_fancy_mixed_2d(self): + self.mixed_frame.ix[:5, ['C', 'B', 'A']] = 5 + result = self.mixed_frame.ix[:5, ['C', 'B', 'A']] + self.assert_((result.values == 5).all()) + + self.mixed_frame.ix[5] = np.nan + self.assert_(isnull(self.mixed_frame.ix[5]).all()) + + self.mixed_frame.ix[5] = self.mixed_frame.ix[6] + assert_series_equal(self.mixed_frame.ix[5], self.mixed_frame.ix[6]) + + # #1432 + df = DataFrame({1: [1., 2., 3.], + 2: [3, 4, 5]}) + self.assert_(df._is_mixed_type) + + df.ix[1] = [5, 10] + + expected = DataFrame({1: [1., 5., 3.], + 2: [3, 10, 5]}) + + assert_frame_equal(df, expected) + + def test_getitem_setitem_non_ix_labels(self): + df = tm.makeTimeDataFrame() + + start, end = df.index[[5, 10]] + + result = df.ix[start:end] + result2 = df[start:end] + expected = df[5:11] + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_ix_assign_column_mixed(self): + # GH #1142 + orig = self.mixed_frame.ix[:, 'B'].copy() + self.mixed_frame.ix[:, 'B'] = self.mixed_frame.ix[:, 'B'] + 1 + assert_series_equal(self.mixed_frame.B, orig + 1) + + def test_ix_multi_take(self): + df = DataFrame(np.random.randn(3, 2)) + rs = df.ix[df.index==0, :] + xp = df.reindex([0]) + assert_frame_equal(rs, xp) + + """ #1321 + df = 
DataFrame(np.random.randn(3, 2)) + rs = df.ix[df.index==0, df.columns==1] + xp = df.reindex([0], [1]) + assert_frame_equal(rs, xp) + """ + + def test_ix_multi_take_nonint_index(self): + df = DataFrame(np.random.randn(3, 2), index=['x','y','z'], + columns=['a','b']) + rs = df.ix[[0], [0]] + xp = df.reindex(['x'], columns=['a']) + assert_frame_equal(rs, xp) + + def test_ix_multi_take_multiindex(self): + df = DataFrame(np.random.randn(3, 2), index=['x','y','z'], + columns=[['a','b'], ['1','2']]) + rs = df.ix[[0], [0]] + xp = df.reindex(['x'], columns=[('a', '1')]) + assert_frame_equal(rs, xp) + + def test_getitem_fancy_1d(self): + f = self.frame + ix = f.ix + + # return self if no slicing...for now + self.assert_(ix[:, :] is f) + + # low dimensional slice + xs1 = ix[2, ['C', 'B', 'A']] + xs2 = f.xs(f.index[2]).reindex(['C', 'B', 'A']) + assert_series_equal(xs1, xs2) + + ts1 = ix[5:10, 2] + ts2 = f[f.columns[2]][5:10] + assert_series_equal(ts1, ts2) + + # positional xs + xs1 = ix[0] + xs2 = f.xs(f.index[0]) + assert_series_equal(xs1, xs2) + + xs1 = ix[f.index[5]] + xs2 = f.xs(f.index[5]) + assert_series_equal(xs1, xs2) + + # single column + assert_series_equal(ix[:, 'A'], f['A']) + + # return view + exp = f.copy() + exp.values[5] = 4 + ix[5][:] = 4 + assert_frame_equal(exp, f) + + exp.values[:, 1] = 6 + ix[:, 1][:] = 6 + assert_frame_equal(exp, f) + + # slice of mixed-frame + xs = self.mixed_frame.ix[5] + exp = self.mixed_frame.xs(self.mixed_frame.index[5]) + assert_series_equal(xs, exp) + + def test_setitem_fancy_1d(self): + # case 1: set cross-section for indices + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[2, ['C', 'B', 'A']] = [1., 2., 3.] + expected['C'][2] = 1. + expected['B'][2] = 2. + expected['A'][2] = 3. + assert_frame_equal(frame, expected) + + frame2 = self.frame.copy() + frame2.ix[2, [3, 2, 1]] = [1., 2., 3.] + assert_frame_equal(frame, expected) + + # case 2, set a section of a column + frame = self.frame.copy() + expected = self.frame.copy() + + vals = randn(5) + expected.values[5:10, 2] = vals + frame.ix[5:10, 2] = vals + assert_frame_equal(frame, expected) + + frame2 = self.frame.copy() + frame2.ix[5:10, 'B'] = vals + assert_frame_equal(frame, expected) + + # case 3: full xs + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[4] = 5. + expected.values[4] = 5. + assert_frame_equal(frame, expected) + + frame.ix[frame.index[4]] = 6. + expected.values[4] = 6. + assert_frame_equal(frame, expected) + + # single column + frame = self.frame.copy() + expected = self.frame.copy() + + frame.ix[:, 'A'] = 7. + expected['A'] = 7. 
+ assert_frame_equal(frame, expected) + + def test_getitem_fancy_scalar(self): + f = self.frame + ix = f.ix + # individual value + for col in f.columns: + ts = f[col] + for idx in f.index[::5]: + assert_almost_equal(ix[idx, col], ts[idx]) + + def test_setitem_fancy_scalar(self): + f = self.frame + expected = self.frame.copy() + ix = f.ix + # individual value + for j, col in enumerate(f.columns): + ts = f[col] + for idx in f.index[::5]: + i = f.index.get_loc(idx) + val = randn() + expected.values[i,j] = val + ix[idx, col] = val + assert_frame_equal(f, expected) + + def test_getitem_fancy_boolean(self): + f = self.frame + ix = f.ix + + expected = f.reindex(columns=['B', 'D']) + result = ix[:, [False, True, False, True]] + assert_frame_equal(result, expected) + + expected = f.reindex(index=f.index[5:10], columns=['B', 'D']) + result = ix[5:10, [False, True, False, True]] + assert_frame_equal(result, expected) + + boolvec = f.index > f.index[7] + expected = f.reindex(index=f.index[boolvec]) + result = ix[boolvec] + assert_frame_equal(result, expected) + result = ix[boolvec, :] + assert_frame_equal(result, expected) + + result = ix[boolvec, 2:] + expected = f.reindex(index=f.index[boolvec], + columns=['C', 'D']) + assert_frame_equal(result, expected) + + def test_setitem_fancy_boolean(self): + # from 2d, set with booleans + frame = self.frame.copy() + expected = self.frame.copy() + + mask = frame['A'] > 0 + frame.ix[mask] = 0. + expected.values[mask] = 0. + assert_frame_equal(frame, expected) + + frame = self.frame.copy() + expected = self.frame.copy() + frame.ix[mask, ['A', 'B']] = 0. + expected.values[mask, :2] = 0. + assert_frame_equal(frame, expected) + + def test_getitem_fancy_ints(self): + result = self.frame.ix[[1,4,7]] + expected = self.frame.ix[self.frame.index[[1,4,7]]] + assert_frame_equal(result, expected) + + result = self.frame.ix[:, [2, 0, 1]] + expected = self.frame.ix[:, self.frame.columns[[2, 0, 1]]] + assert_frame_equal(result, expected) + + def test_getitem_setitem_fancy_exceptions(self): + ix = self.frame.ix + self.assertRaises(Exception, ix.__getitem__, + (slice(None, None, None), + slice(None, None, None), + slice(None, None, None))) + self.assertRaises(Exception, ix.__setitem__, + (slice(None, None, None), + slice(None, None, None), + slice(None, None, None)), 1) + + def test_getitem_setitem_boolean_misaligned(self): + # boolean index misaligned labels + mask = self.frame['A'][::-1] > 1 + + result = self.frame.ix[mask] + expected = self.frame.ix[mask[::-1]] + assert_frame_equal(result, expected) + + cp = self.frame.copy() + expected = self.frame.copy() + cp.ix[mask] = 0 + expected.ix[mask] = 0 + assert_frame_equal(cp, expected) + + def test_getitem_setitem_boolean_multi(self): + df = DataFrame(np.random.randn(3, 2)) + + # get + k1 = np.array([True, False, True]) + k2 = np.array([False, True]) + result = df.ix[k1, k2] + expected = df.ix[[0, 2], [1]] + assert_frame_equal(result, expected) + + expected = df.copy() + df.ix[np.array([True, False, True]), + np.array([False, True])] = 5 + expected.ix[[0, 2], [1]] = 5 + assert_frame_equal(df, expected) + + def test_getitem_setitem_float_labels(self): + index = Index([1.5, 2, 3, 4, 5]) + df = DataFrame(np.random.randn(5, 5), index=index) + + result = df.ix[1.5:4] + expected = df.reindex([1.5, 2, 3, 4]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 4) + + result = df.ix[4:5] + expected = df.reindex([4, 5]) + assert_frame_equal(result, expected) + self.assertEqual(len(result), 2) + + # this should raise an 
exception + self.assertRaises(Exception, df.ix.__getitem__, slice(1, 2)) + self.assertRaises(Exception, df.ix.__setitem__, slice(1, 2), 0) + + def test_setitem_single_column_mixed(self): + df = DataFrame(randn(5, 3), index=['a', 'b', 'c', 'd', 'e'], + columns=['foo', 'bar', 'baz']) + df['str'] = 'qux' + df.ix[::2, 'str'] = nan + expected = [nan, 'qux', nan, 'qux', nan] + assert_almost_equal(df['str'].values, expected) + + def test_setitem_frame(self): + piece = self.frame.ix[:2, ['A', 'B']] + self.frame.ix[-2:, ['A', 'B']] = piece + assert_almost_equal(self.frame.ix[-2:, ['A', 'B']].values, + piece.values) + + piece = self.mixed_frame.ix[:2, ['A', 'B']] + f = self.mixed_frame.ix.__setitem__ + key = (slice(-2, None), ['A', 'B']) + self.assertRaises(ValueError, f, key, piece) + + def test_setitem_fancy_exceptions(self): + pass + + def test_getitem_boolean_missing(self): + pass + + def test_setitem_boolean_missing(self): + pass + + def test_getitem_setitem_ix_duplicates(self): + # #1201 + df = DataFrame(np.random.randn(5, 3), + index=['foo', 'foo', 'bar', 'baz', 'bar']) + + result = df.ix['foo'] + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.ix['bar'] + expected = df.ix[[2, 4]] + assert_frame_equal(result, expected) + + result = df.ix['baz'] + expected = df.ix[3] + assert_series_equal(result, expected) + + def test_get_value(self): + for idx in self.frame.index: + for col in self.frame.columns: + result = self.frame.get_value(idx, col) + expected = self.frame[col][idx] + assert_almost_equal(result, expected) + + def test_lookup(self): + def alt(df, rows, cols): + result = [] + for r, c in zip(rows, cols): + result.append(df.get_value(r, c)) + return result + + def testit(df): + rows = list(df.index) * len(df.columns) + cols = list(df.columns) * len(df.index) + result = df.lookup(rows, cols) + expected = alt(df, rows, cols) + assert_almost_equal(result, expected) + + testit(self.mixed_frame) + testit(self.frame) + + df = DataFrame({'label' : ['a', 'b', 'a', 'c'], + 'mask_a' : [True, True, False, True], + 'mask_b' : [True, False, False, False], + 'mask_c' : [False, True, False, True]}) + df['mask'] = df.lookup(df.index, 'mask_' + df['label']) + exp_mask = alt(df, df.index, 'mask_' + df['label']) + assert_almost_equal(df['mask'], exp_mask) + self.assert_(df['mask'].dtype == np.bool_) + + self.assertRaises(ValueError, self.frame.lookup, + ['xyz'], ['A']) + + self.assertRaises(ValueError, self.frame.lookup, + [self.frame.index[0]], ['xyz']) + + def test_set_value(self): + for idx in self.frame.index: + for col in self.frame.columns: + self.frame.set_value(idx, col, 1) + assert_almost_equal(self.frame[col][idx], 1) + + def test_set_value_resize(self): + res = self.frame.set_value('foobar', 'B', 0) + self.assert_(res is not self.frame) + self.assert_(res.index[-1] == 'foobar') + self.assertEqual(res.get_value('foobar', 'B'), 0) + + res2 = res.set_value('foobar', 'qux', 0) + self.assert_(res2 is not res) + self.assert_(np.array_equal(res2.columns, + list(self.frame.columns) + ['qux'])) + self.assertEqual(res2.get_value('foobar', 'qux'), 0) + + res3 = res.set_value('foobar', 'baz', 'sam') + self.assert_(res3['baz'].dtype == np.object_) + + res3 = res.set_value('foobar', 'baz', True) + self.assert_(res3['baz'].dtype == np.object_) + + res3 = res.set_value('foobar', 'baz', 5) + self.assert_(com.is_float_dtype(res3['baz'])) + self.assert_(isnull(res3['baz'].drop(['foobar'])).values.all()) + self.assertRaises(ValueError, res3.set_value, 'foobar', 'baz', 'sam') + + def 
test_set_value_with_index_dtype_change(self): + df = DataFrame(randn(3,3), index=range(3), columns=list('ABC')) + res = df.set_value('C', 2, 1.0) + self.assert_(list(res.index) == list(df.index) + ['C']) + self.assert_(list(res.columns) == list(df.columns) + [2]) + + def test_get_set_value_no_partial_indexing(self): + # partial w/ MultiIndex raise exception + index = MultiIndex.from_tuples([(0, 1), (0, 2), (1, 1), (1, 2)]) + df = DataFrame(index=index, columns=range(4)) + self.assertRaises(KeyError, df.get_value, 0, 1) + # self.assertRaises(KeyError, df.set_value, 0, 1, 0) + + def test_single_element_ix_dont_upcast(self): + self.frame['E'] = 1 + self.assert_(issubclass(self.frame['E'].dtype.type, + (int, np.integer))) + + result = self.frame.ix[self.frame.index[5], 'E'] + self.assert_(com.is_integer(result)) + + def test_irow(self): + df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) + + result = df.irow(1) + exp = df.ix[2] + assert_series_equal(result, exp) + + result = df.irow(2) + exp = df.ix[4] + assert_series_equal(result, exp) + + # slice + result = df.irow(slice(4, 8)) + expected = df.ix[8:14] + assert_frame_equal(result, expected) + + # verify slice is view + result[2] = 0. + exp_col = df[2].copy() + exp_col[4:8] = 0. + assert_series_equal(df[2], exp_col) + + # list of integers + result = df.irow([1, 2, 4, 6]) + expected = df.reindex(df.index[[1, 2, 4, 6]]) + assert_frame_equal(result, expected) + + def test_icol(self): + df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2)) + + result = df.icol(1) + exp = df.ix[:, 2] + assert_series_equal(result, exp) + + result = df.icol(2) + exp = df.ix[:, 4] + assert_series_equal(result, exp) + + # slice + result = df.icol(slice(4, 8)) + expected = df.ix[:, 8:14] + assert_frame_equal(result, expected) + + # verify slice is view + result[8] = 0. 
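# A minimal illustrative sketch (not part of the imported source) of the
# 0.8-era positional accessors exercised in test_irow/test_icol/test_iget_value
# above; the frame below is a made-up example, not test data.
import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(12).reshape(4, 3), columns=['a', 'b', 'c'])
row = df.irow(1)            # second row as a Series, selected by position
col = df.icol(2)            # third column as a Series, selected by position
cell = df.iget_value(0, 1)  # scalar at row position 0, column position 1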
+ self.assert_((df[8] == 0).all()) + + # list of integers + result = df.icol([1, 2, 4, 6]) + expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) + assert_frame_equal(result, expected) + + def test_irow_icol_duplicates(self): + df = DataFrame(np.random.rand(3,3), columns=list('ABC'), + index=list('aab')) + + result = df.irow(0) + result2 = df.ix[0] + self.assert_(isinstance(result, Series)) + assert_almost_equal(result.values, df.values[0]) + assert_series_equal(result, result2) + + result = df.T.icol(0) + result2 = df.T.ix[:, 0] + self.assert_(isinstance(result, Series)) + assert_almost_equal(result.values, df.values[0]) + assert_series_equal(result, result2) + + def test_iget_value(self): + for i, row in enumerate(self.frame.index): + for j, col in enumerate(self.frame.columns): + result = self.frame.iget_value(i, j) + expected = self.frame.get_value(row, col) + assert_almost_equal(result, expected) + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() + +_frame = DataFrame(_seriesd) +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A']) +_intframe = DataFrame(dict((k, v.astype(int)) + for k, v in _seriesd.iteritems())) + +_tsframe = DataFrame(_tsd) + +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + +class SafeForSparse(object): + + def test_getitem_pop_assign_name(self): + s = self.frame['A'] + self.assertEqual(s.name, 'A') + + s = self.frame.pop('A') + self.assertEqual(s.name, 'A') + + s = self.frame.ix[:, 'B'] + self.assertEqual(s.name, 'B') + + s2 = s.ix[:] + self.assertEqual(s2.name, 'B') + + def test_get_value(self): + for idx in self.frame.index: + for col in self.frame.columns: + result = self.frame.get_value(idx, col) + expected = self.frame[col][idx] + assert_almost_equal(result, expected) + + def test_join_index(self): + # left / right + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2) + self.assert_(f.index.equals(joined.index)) + self.assertEqual(len(joined.columns), 4) + + joined = f.join(f2, how='left') + self.assert_(joined.index.equals(f.index)) + self.assertEqual(len(joined.columns), 4) + + joined = f.join(f2, how='right') + self.assert_(joined.index.equals(f2.index)) + self.assertEqual(len(joined.columns), 4) + + # corner case + self.assertRaises(Exception, self.frame.join, self.frame, + how='left') + + # inner + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2, how='inner') + self.assert_(joined.index.equals(f.index.intersection(f2.index))) + self.assertEqual(len(joined.columns), 4) + + # corner case + self.assertRaises(Exception, self.frame.join, self.frame, + how='inner') + + # outer + + f = self.frame.reindex(columns=['A', 'B'])[:10] + f2 = self.frame.reindex(columns=['C', 'D']) + + joined = f.join(f2, how='outer') + self.assert_(tm.equalContents(self.frame.index, joined.index)) + self.assertEqual(len(joined.columns), 4) + + # corner case + self.assertRaises(Exception, self.frame.join, self.frame, + how='outer') + + self.assertRaises(Exception, f.join, f2, how='foo') + + def test_join_index_more(self): + af = self.frame.ix[:, ['A', 'B']] + bf = self.frame.ix[::2, ['C', 'D']] + + expected = af.copy() + expected['C'] = self.frame['C'][::2] + expected['D'] = self.frame['D'][::2] + + result = af.join(bf) + assert_frame_equal(result, expected) + + result = af.join(bf, how='right') + assert_frame_equal(result, expected[::2]) + + result = bf.join(af, how='right') + assert_frame_equal(result, 
expected.ix[:, result.columns]) + + def test_join_index_series(self): + df = self.frame.copy() + s = df.pop(self.frame.columns[-1]) + joined = df.join(s) + assert_frame_equal(joined, self.frame) + + s.name = None + self.assertRaises(Exception, df.join, s) + + def test_join_overlap(self): + df1 = self.frame.ix[:, ['A', 'B', 'C']] + df2 = self.frame.ix[:, ['B', 'C', 'D']] + + joined = df1.join(df2, lsuffix='_df1', rsuffix='_df2') + df1_suf = df1.ix[:, ['B', 'C']].add_suffix('_df1') + df2_suf = df2.ix[:, ['B', 'C']].add_suffix('_df2') + no_overlap = self.frame.ix[:, ['A', 'D']] + expected = df1_suf.join(df2_suf).join(no_overlap) + + # column order not necessarily sorted + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_add_prefix_suffix(self): + with_prefix = self.frame.add_prefix('foo#') + expected = ['foo#%s' % c for c in self.frame.columns] + self.assert_(np.array_equal(with_prefix.columns, expected)) + + with_suffix = self.frame.add_suffix('#foo') + expected = ['%s#foo' % c for c in self.frame.columns] + self.assert_(np.array_equal(with_suffix.columns, expected)) + + +class TestDataFrame(unittest.TestCase, CheckIndexing, + SafeForSparse): + klass = DataFrame + + def setUp(self): + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.intframe = _intframe.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + self.ts1 = tm.makeTimeSeries() + self.ts2 = tm.makeTimeSeries()[5:] + self.ts3 = tm.makeTimeSeries()[-5:] + self.ts4 = tm.makeTimeSeries()[1:-1] + + self.ts_dict = { + 'col1' : self.ts1, + 'col2' : self.ts2, + 'col3' : self.ts3, + 'col4' : self.ts4, + } + self.empty = DataFrame({}) + + arr = np.array([[1., 2., 3.], + [4., 5., 6.], + [7., 8., 9.]]) + + self.simple = DataFrame(arr, columns=['one', 'two', 'three'], + index=['a', 'b', 'c']) + + def test_get_axis(self): + self.assert_(DataFrame._get_axis_name(0) == 'index') + self.assert_(DataFrame._get_axis_name(1) == 'columns') + self.assert_(DataFrame._get_axis_name('index') == 'index') + self.assert_(DataFrame._get_axis_name('columns') == 'columns') + self.assertRaises(Exception, DataFrame._get_axis_name, 'foo') + self.assertRaises(Exception, DataFrame._get_axis_name, None) + + self.assert_(DataFrame._get_axis_number(0) == 0) + self.assert_(DataFrame._get_axis_number(1) == 1) + self.assert_(DataFrame._get_axis_number('index') == 0) + self.assert_(DataFrame._get_axis_number('columns') == 1) + self.assertRaises(Exception, DataFrame._get_axis_number, 2) + self.assertRaises(Exception, DataFrame._get_axis_number, None) + + self.assert_(self.frame._get_axis(0) is self.frame.index) + self.assert_(self.frame._get_axis(1) is self.frame.columns) + + def test_set_index(self): + idx = Index(np.arange(len(self.mixed_frame))) + + # cache it + _ = self.mixed_frame['foo'] + self.mixed_frame.index = idx + self.assert_(self.mixed_frame['foo'].index is idx) + self.assertRaises(Exception, setattr, self.mixed_frame, 'index', + idx[::2]) + + def test_set_index2(self): + df = DataFrame({'A' : ['foo', 'foo', 'foo', 'bar', 'bar'], + 'B' : ['one', 'two', 'three', 'one', 'two'], + 'C' : ['a', 'b', 'c', 'd', 'e'], + 'D' : np.random.randn(5), + 'E' : np.random.randn(5)}) + + # new object, single-column + result = df.set_index('C') + result_nodrop = df.set_index('C', drop=False) + + index = Index(df['C'], name='C') + + expected = df.ix[:, ['A', 'B', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + 
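# A minimal illustrative sketch (not part of the imported source) of
# DataFrame.set_index as exercised in test_set_index2: a column becomes the
# index, optionally kept as data with drop=False or applied in place.
# 'data' is a made-up example frame.
from pandas import DataFrame

data = DataFrame({'key': ['a', 'b', 'c'], 'val': [1, 2, 3]})
by_key = data.set_index('key')             # new object, 'key' removed from columns
kept = data.set_index('key', drop=False)   # 'key' stays as a column as well
data.set_index('key', inplace=True)        # same operation, modifying data in place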
assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.name, index.name) + + # inplace, single + df2 = df.copy() + df2.set_index('C', inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index('C', drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # create new object, multi-column + result = df.set_index(['A', 'B']) + result_nodrop = df.set_index(['A', 'B'], drop=False) + + index = MultiIndex.from_arrays([df['A'], df['B']], names=['A', 'B']) + + expected = df.ix[:, ['C', 'D', 'E']] + expected.index = index + + expected_nodrop = df.copy() + expected_nodrop.index = index + + assert_frame_equal(result, expected) + assert_frame_equal(result_nodrop, expected_nodrop) + self.assertEqual(result.index.names, index.names) + + # inplace + df2 = df.copy() + df2.set_index(['A', 'B'], inplace=True) + assert_frame_equal(df2, expected) + + df3 = df.copy() + df3.set_index(['A', 'B'], drop=False, inplace=True) + assert_frame_equal(df3, expected_nodrop) + + # corner case + self.assertRaises(Exception, df.set_index, 'A', verify_integrity=True) + + def test_set_index_pass_arrays(self): + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) + + # multiple columns + result = df.set_index(['A', df['B'].values], drop=False) + expected = df.set_index(['A', 'B'], drop=False) + assert_frame_equal(result, expected) + + def test_set_index_cast_datetimeindex(self): + df = DataFrame({'A' : [datetime(2000, 1, 1) + timedelta(i) + for i in range(1000)], + 'B' : np.random.randn(1000)}) + + idf = df.set_index('A') + self.assert_(isinstance(idf.index, DatetimeIndex)) + + def test_set_columns(self): + cols = Index(np.arange(len(self.mixed_frame.columns))) + self.mixed_frame.columns = cols + self.assertRaises(Exception, setattr, self.mixed_frame, 'columns', + cols[::2]) + + def test_keys(self): + getkeys = self.frame.keys + self.assert_(getkeys() is self.frame.columns) + + def test_column_contains_typeerror(self): + try: + self.frame.columns in self.frame + except TypeError: + pass + + def test_constructor(self): + df = DataFrame() + self.assert_(len(df.index) == 0) + + df = DataFrame(data={}) + self.assert_(len(df.index) == 0) + + def test_list_to_sdict(self): + from pandas.core.frame import _list_to_sdict + + d, c = _list_to_sdict([], None) + self.assertEquals(d, {}) + self.assertEquals(c, []) + + d, c = _list_to_sdict([], []) + self.assertEquals(d, {}) + self.assertEquals(c, []) + + def test_constructor_mixed(self): + index, data = tm.getMixedTypeDict() + + indexed_frame = DataFrame(data, index=index) + unindexed_frame = DataFrame(data) + + self.assertEqual(self.mixed_frame['foo'].dtype, np.object_) + + def test_constructor_cast_failure(self): + foo = DataFrame({'a': ['a', 'b', 'c']}, dtype=np.float64) + self.assert_(foo['a'].dtype == object) + + def test_constructor_rec(self): + rec = self.frame.to_records(index=False) + + # Assigning causes segfault in NumPy < 1.5.1 + # rec.dtype.names = list(rec.dtype.names)[::-1] + + index = self.frame.index + + df = DataFrame(rec) + self.assert_(np.array_equal(df.columns, rec.dtype.names)) + + df2 = DataFrame(rec, index=index) + self.assert_(np.array_equal(df2.columns, rec.dtype.names)) + self.assert_(df2.index.equals(index)) + + rng = np.arange(len(rec))[::-1] + df3 = DataFrame(rec, index=rng, columns=['C', 'B']) + expected = DataFrame(rec, 
index=rng).reindex(columns=['C', 'B']) + assert_frame_equal(df3, expected) + + def test_constructor_bool(self): + df = DataFrame({0 : np.ones(10, dtype=bool), + 1 : np.zeros(10, dtype=bool)}) + self.assertEqual(df.values.dtype, np.bool_) + + def test_is_mixed_type(self): + self.assert_(not self.frame._is_mixed_type) + self.assert_(self.mixed_frame._is_mixed_type) + + def test_constructor_dict(self): + frame = DataFrame({'col1' : self.ts1, + 'col2' : self.ts2}) + + tm.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False) + tm.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False) + + frame = DataFrame({'col1' : self.ts1, + 'col2' : self.ts2}, + columns=['col2', 'col3', 'col4']) + + self.assertEqual(len(frame), len(self.ts2)) + self.assert_('col1' not in frame) + self.assert_(np.isnan(frame['col3']).all()) + + # Corner cases + self.assertEqual(len(DataFrame({})), 0) + self.assertRaises(Exception, lambda x: DataFrame([self.ts1, self.ts2])) + + # mix dict and array, wrong size + self.assertRaises(Exception, DataFrame, + {'A' : {'a' : 'a', 'b' : 'b'}, + 'B' : ['a', 'b', 'c']}) + + + # Length-one dict micro-optimization + frame = DataFrame({'A' : {'1' : 1, '2' : 2}}) + self.assert_(np.array_equal(frame.index, ['1', '2'])) + + # empty dict plus index + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx) + self.assert_(frame.index is idx) + + # empty with index and columns + idx = Index([0, 1, 2]) + frame = DataFrame({}, index=idx, columns=idx) + self.assert_(frame.index is idx) + self.assert_(frame.columns is idx) + self.assertEqual(len(frame._series), 3) + + # with dict of empty list and Series + frame = DataFrame({'A' : [], 'B' : []}, columns=['A', 'B']) + self.assert_(frame.index.equals(Index([]))) + + def test_constructor_subclass_dict(self): + # Test for passing dict subclass to constructor + data = {'col1': tm.TestSubDict((x, 10.0 * x) for x in xrange(10)), + 'col2': tm.TestSubDict((x, 20.0 * x) for x in xrange(10))} + df = DataFrame(data) + refdf = DataFrame(dict((col, dict(val.iteritems())) + for col, val in data.iteritems())) + assert_frame_equal(refdf, df) + + data = tm.TestSubDict(data.iteritems()) + df = DataFrame(data) + assert_frame_equal(refdf, df) + + # try with defaultdict + from collections import defaultdict + data = {} + self.frame['B'][:10] = np.nan + for k, v in self.frame.iterkv(): + dct = defaultdict(dict) + dct.update(v.to_dict()) + data[k] = dct + frame = DataFrame(data) + assert_frame_equal(self.frame.sort_index(), frame) + + def test_constructor_dict_block(self): + expected = [[4., 3., 2., 1.]] + df = DataFrame({'d' : [4.],'c' : [3.],'b' : [2.],'a' : [1.]}, + columns=['d', 'c', 'b', 'a']) + assert_almost_equal(df.values, expected) + + def test_constructor_dict_cast(self): + # cast float tests + test_data = { + 'A' : {'1' : 1, '2' : 2}, + 'B' : {'1' : '1', '2' : '2', '3' : '3'}, + } + frame = DataFrame(test_data, dtype=float) + self.assertEqual(len(frame), 3) + self.assert_(frame['B'].dtype == np.float64) + self.assert_(frame['A'].dtype == np.float64) + + frame = DataFrame(test_data) + self.assertEqual(len(frame), 3) + self.assert_(frame['B'].dtype == np.object_) + self.assert_(frame['A'].dtype == np.float64) + + # can't cast to float + test_data = { + 'A' : dict(zip(range(20), tm.makeStringIndex(20))), + 'B' : dict(zip(range(15), randn(15))) + } + frame = DataFrame(test_data, dtype=float) + self.assertEqual(len(frame), 20) + self.assert_(frame['A'].dtype == np.object_) + self.assert_(frame['B'].dtype == np.float64) + + def 
test_constructor_dict_dont_upcast(self): + d = {'Col1': {'Row1': 'A String', 'Row2': np.nan}} + df = DataFrame(d) + self.assert_(isinstance(df['Col1']['Row2'], float)) + + dm = DataFrame([[1,2],['a','b']], index=[1,2], columns=[1,2]) + self.assert_(isinstance(dm[1][1], int)) + + def test_constructor_dict_of_tuples(self): + # GH #1491 + data = {'a': (1, 2, 3), 'b': (4, 5, 6)} + + result = DataFrame(data) + expected = DataFrame(dict((k, list(v)) for k, v in data.iteritems())) + assert_frame_equal(result, expected) + + def test_constructor_ndarray(self): + mat = np.zeros((2, 3), dtype=float) + + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=int) + self.assert_(frame.values.dtype == np.int64) + + # 1-D input + frame = DataFrame(np.zeros(3), columns=['A'], index=[1, 2, 3]) + self.assertEqual(len(frame.index), 3) + self.assertEqual(len(frame.columns), 1) + + frame = DataFrame(['foo', 'bar'], index=[0, 1], columns=['A']) + self.assertEqual(len(frame), 2) + + # higher dim raise exception + self.assertRaises(Exception, DataFrame, np.zeros((3, 3, 3)), + columns=['A', 'B', 'C'], index=[1]) + + # wrong size axis labels + self.assertRaises(Exception, DataFrame, mat, + columns=['A', 'B', 'C'], index=[1]) + + self.assertRaises(Exception, DataFrame, mat, + columns=['A', 'B'], index=[1, 2]) + + # automatic labeling + frame = DataFrame(mat) + self.assert_(np.array_equal(frame.index, range(2))) + self.assert_(np.array_equal(frame.columns, range(3))) + + frame = DataFrame(mat, index=[1, 2]) + self.assert_(np.array_equal(frame.columns, range(3))) + + frame = DataFrame(mat, columns=['A', 'B', 'C']) + self.assert_(np.array_equal(frame.index, range(2))) + + # 0-length axis + frame = DataFrame(np.empty((0, 3))) + self.assert_(len(frame.index) == 0) + + frame = DataFrame(np.empty((3, 0))) + self.assert_(len(frame.columns) == 0) + + def test_constructor_maskedarray(self): + mat = ma.masked_all((2, 3), dtype=float) + + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=int) + self.assert_(frame.values.dtype == np.int64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = 1.0 + mat2[1,2] = 2.0 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1.0, frame['A'][1]) + self.assertEqual(2.0, frame['C'][2]) + + # 1-D input + frame = DataFrame(ma.masked_all((3,)), columns=['A'], index=[1, 2, 3]) + self.assertEqual(len(frame.index), 3) + self.assertEqual(len(frame.columns), 1) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # higher dim raise exception + self.assertRaises(Exception, DataFrame, ma.masked_all((3, 3, 3)), + columns=['A', 'B', 'C'], index=[1]) + + # wrong size axis labels + self.assertRaises(Exception, DataFrame, mat, + columns=['A', 'B', 'C'], index=[1]) + + self.assertRaises(Exception, DataFrame, mat, + columns=['A', 'B'], index=[1, 2]) + + # automatic labeling + frame = DataFrame(mat) + self.assert_(np.array_equal(frame.index, range(2))) + self.assert_(np.array_equal(frame.columns, range(3))) + + frame = DataFrame(mat, index=[1, 2]) + self.assert_(np.array_equal(frame.columns, range(3))) + + frame = 
DataFrame(mat, columns=['A', 'B', 'C']) + self.assert_(np.array_equal(frame.index, range(2))) + + # 0-length axis + frame = DataFrame(ma.masked_all((0, 3))) + self.assert_(len(frame.index) == 0) + + frame = DataFrame(ma.masked_all((3, 0))) + self.assert_(len(frame.columns) == 0) + + def test_constructor_maskedarray_nonfloat(self): + # masked int promoted to float + mat = ma.masked_all((2, 3), dtype=int) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=float) + self.assert_(frame.values.dtype == np.float64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = 1 + mat2[1,2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'][1]) + self.assertEqual(2, frame['C'][2]) + + # masked np.datetime64 stays (use lib.NaT as null) + mat = ma.masked_all((2, 3), dtype='M8[ns]') + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(isnull(frame).values.all()) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=np.int64) + self.assert_(frame.values.dtype == np.int64) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = 1 + mat2[1,2] = 2 + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(1, frame['A'].view('i8')[1]) + self.assertEqual(2, frame['C'].view('i8')[2]) + + # masked bool promoted to object + mat = ma.masked_all((2, 3), dtype=bool) + # 2-D input + frame = DataFrame(mat, columns=['A', 'B', 'C'], index=[1, 2]) + + self.assertEqual(len(frame.index), 2) + self.assertEqual(len(frame.columns), 3) + self.assertTrue(np.all(~np.asarray(frame == frame))) + + # cast type + frame = DataFrame(mat, columns=['A', 'B', 'C'], + index=[1, 2], dtype=object) + self.assert_(frame.values.dtype == object) + + # Check non-masked values + mat2 = ma.copy(mat) + mat2[0,0] = True + mat2[1,2] = False + frame = DataFrame(mat2, columns=['A', 'B', 'C'], index=[1, 2]) + self.assertEqual(True, frame['A'][1]) + self.assertEqual(False, frame['C'][2]) + + def test_constructor_corner(self): + df = DataFrame(index=[]) + self.assertEqual(df.values.shape, (0, 0)) + + # empty but with specified dtype + df = DataFrame(index=range(10), columns=['a','b'], dtype=object) + self.assert_(df.values.dtype == np.object_) + + # does not error but ends up float + df = DataFrame(index=range(10), columns=['a','b'], dtype=int) + self.assert_(df.values.dtype == np.float64) + + def test_constructor_scalar_inference(self): + data = {'int' : 1, 'bool' : True, + 'float' : 3., 'complex': 4j, 'object' : 'foo'} + df = DataFrame(data, index=np.arange(10)) + + self.assert_(df['int'].dtype == np.int64) + self.assert_(df['bool'].dtype == np.bool_) + self.assert_(df['float'].dtype == np.float64) + self.assert_(df['complex'].dtype == np.complex128) + self.assert_(df['object'].dtype == np.object_) + + def test_constructor_arrays_and_scalars(self): + df = DataFrame({'a': randn(10), 'b': True}) + exp = DataFrame({'a': df['a'].values, 'b': [True] * 10}) + + assert_frame_equal(df, exp) + + self.assertRaises(ValueError, DataFrame, {'a': False, 'b': True}) + + def test_constructor_DataFrame(self): + df = DataFrame(self.frame) + assert_frame_equal(df, 
self.frame) + + df_casted = DataFrame(self.frame, dtype=int) + self.assert_(df_casted.values.dtype == np.int64) + + def test_constructor_more(self): + # used to be in test_matrix.py + arr = randn(10) + dm = DataFrame(arr, columns=['A'], index=np.arange(10)) + self.assertEqual(dm.values.ndim, 2) + + arr = randn(0) + dm = DataFrame(arr) + self.assertEqual(dm.values.ndim, 2) + self.assertEqual(dm.values.ndim, 2) + + # no data specified + dm = DataFrame(columns=['A', 'B'], index=np.arange(10)) + self.assertEqual(dm.values.shape, (10, 2)) + + dm = DataFrame(columns=['A', 'B']) + self.assertEqual(dm.values.shape, (0, 2)) + + dm = DataFrame(index=np.arange(10)) + self.assertEqual(dm.values.shape, (10, 0)) + + # corner, silly + self.assertRaises(Exception, DataFrame, (1, 2, 3)) + + # can't cast + mat = np.array(['foo', 'bar'], dtype=object).reshape(2, 1) + self.assertRaises(ValueError, DataFrame, mat, index=[0, 1], + columns=[0], dtype=float) + + dm = DataFrame(DataFrame(self.frame._series)) + tm.assert_frame_equal(dm, self.frame) + + # int cast + dm = DataFrame({'A' : np.ones(10, dtype=int), + 'B' : np.ones(10, dtype=float)}, + index=np.arange(10)) + + self.assertEqual(len(dm.columns), 2) + self.assert_(dm.values.dtype == np.float64) + + def test_constructor_empty_list(self): + df = DataFrame([], index=[]) + expected = DataFrame(index=[]) + assert_frame_equal(df, expected) + + def test_constructor_list_of_lists(self): + # GH #484 + l = [[1, 'a'], [2, 'b']] + df = DataFrame(data=l, columns=["num", "str"]) + self.assert_(com.is_integer_dtype(df['num'])) + self.assert_(df['str'].dtype == np.object_) + + def test_constructor_list_of_dicts(self): + data = [{'a': 1.5, 'b': 3, 'c':4, 'd':6}, + {'a': 1.5, 'b': 3, 'd':6}, + {'a': 1.5, 'd':6}, + {}, + {'a': 1.5, 'b': 3, 'c':4}, + {'b': 3, 'c':4, 'd':6}] + + result = DataFrame(data) + expected = DataFrame.from_dict(dict(zip(range(len(data)), data)), + orient='index') + assert_frame_equal(result, expected.reindex(result.index)) + + result = DataFrame([{}]) + expected = DataFrame(index=[0]) + assert_frame_equal(result, expected) + + def test_constructor_list_of_series(self): + data = [{'a': 1.5, 'b': 3.0, 'c':4.0}, + {'a': 1.5, 'b': 3.0, 'c':6.0}] + sdict = dict(zip(['x', 'y'], data)) + idx = Index(['a', 'b', 'c']) + + # all named + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx, name='y')] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + # some unnamed + data2 = [Series([1.5, 3, 4], idx, dtype='O', name='x'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + + sdict = dict(zip(['x', 'Unnamed 0'], data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result.sort_index(), expected) + + # none named + data = [{'a': 1.5, 'b': 3, 'c':4, 'd':6}, + {'a': 1.5, 'b': 3, 'd':6}, + {'a': 1.5, 'd':6}, + {}, + {'a': 1.5, 'b': 3, 'c':4}, + {'b': 3, 'c':4, 'd':6}] + data = [Series(d) for d in data] + + result = DataFrame(data) + sdict = dict(zip(range(len(data)), data)) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected.reindex(result.index)) + + result2 = DataFrame(data, index=np.arange(6)) + assert_frame_equal(result, result2) + + result = DataFrame([Series({})]) + expected = DataFrame(index=[0]) + assert_frame_equal(result, expected) + + data = [{'a': 1.5, 'b': 3.0, 'c':4.0}, + {'a': 1.5, 'b': 3.0, 'c':6.0}] + sdict = dict(zip(range(len(data)), data)) + idx = Index(['a', 'b', 
'c']) + data2 = [Series([1.5, 3, 4], idx, dtype='O'), + Series([1.5, 3, 6], idx)] + result = DataFrame(data2) + expected = DataFrame.from_dict(sdict, orient='index') + assert_frame_equal(result, expected) + + def test_constructor_list_of_derived_dicts(self): + class CustomDict(dict): + pass + d = {'a': 1.5, 'b': 3} + + data_custom = [CustomDict(d)] + data = [d] + + result_custom = DataFrame(data_custom) + result = DataFrame(data) + assert_frame_equal(result, result_custom) + + def test_constructor_ragged(self): + data = {'A' : randn(10), + 'B' : randn(8)} + self.assertRaises(Exception, DataFrame, data) + + def test_constructor_scalar(self): + idx = Index(range(3)) + df = DataFrame({"a" : 0}, index=idx) + expected = DataFrame({"a" : [0, 0, 0]}, index=idx) + assert_frame_equal(df, expected) + + def test_constructor_Series_copy_bug(self): + df = DataFrame(self.frame['A'], index=self.frame.index, columns=['A']) + df.copy() + + def test_constructor_mixed_dict_and_Series(self): + data = {} + data['A'] = {'foo' : 1, 'bar' : 2, 'baz' : 3} + data['B'] = Series([4, 3, 2, 1], index=['bar', 'qux', 'baz', 'foo']) + + result = DataFrame(data) + self.assert_(result.index.is_monotonic) + + # ordering ambiguous, raise exception + self.assertRaises(Exception, DataFrame, + {'A' : ['a', 'b'], 'B' : {'a' : 'a', 'b' : 'b'}}) + + # this is OK though + result = DataFrame({'A' : ['a', 'b'], + 'B' : Series(['a', 'b'], index=['a', 'b'])}) + expected = DataFrame({'A' : ['a', 'b'], 'B' : ['a', 'b']}, + index=['a', 'b']) + assert_frame_equal(result, expected) + + def test_constructor_tuples(self): + result = DataFrame({'A': [(1, 2), (3, 4)]}) + expected = DataFrame({'A': Series([(1, 2), (3, 4)])}) + assert_frame_equal(result, expected) + + def test_constructor_orient(self): + data_dict = self.mixed_frame.T._series + recons = DataFrame.from_dict(data_dict, orient='index') + expected = self.mixed_frame.sort_index() + assert_frame_equal(recons, expected) + + def test_constructor_Series_named(self): + a = Series([1,2,3], index=['a','b','c'], name='x') + df = DataFrame(a) + self.assert_(df.columns[0] == 'x') + self.assert_(df.index.equals(a.index)) + + def test_constructor_Series_differently_indexed(self): + # name + s1 = Series([1, 2, 3], index=['a','b','c'], name='x') + + # no name + s2 = Series([1, 2, 3], index=['a','b','c']) + + other_index = Index(['a', 'b']) + + df1 = DataFrame(s1, index=other_index) + exp1 = DataFrame(s1.reindex(other_index)) + self.assert_(df1.columns[0] == 'x') + assert_frame_equal(df1, exp1) + + df2 = DataFrame(s2, index=other_index) + exp2 = DataFrame(s2.reindex(other_index)) + self.assert_(df2.columns[0] == 0) + self.assert_(df2.index.equals(other_index)) + assert_frame_equal(df2, exp2) + + def test_constructor_manager_resize(self): + index = list(self.frame.index[:5]) + columns = list(self.frame.columns[:3]) + + result = DataFrame(self.frame._data, index=index, + columns=columns) + self.assert_(np.array_equal(result.index, index)) + self.assert_(np.array_equal(result.columns, columns)) + + def test_constructor_from_items(self): + items = [(c, self.frame[c]) for c in self.frame.columns] + recons = DataFrame.from_items(items) + assert_frame_equal(recons, self.frame) + + # pass some columns + recons = DataFrame.from_items(items, columns=['C', 'B', 'A']) + assert_frame_equal(recons, self.frame.ix[:, ['C', 'B', 'A']]) + + # orient='index' + + row_items = [(idx, self.mixed_frame.xs(idx)) + for idx in self.mixed_frame.index] + + recons = DataFrame.from_items(row_items, + 
columns=self.mixed_frame.columns, + orient='index') + assert_frame_equal(recons, self.mixed_frame) + self.assert_(recons['A'].dtype == np.float64) + + self.assertRaises(ValueError, DataFrame.from_items, row_items, + orient='index') + + # orient='index', but thar be tuples + arr = lib.list_to_object_array([('bar', 'baz')] * len(self.mixed_frame)) + self.mixed_frame['foo'] = arr + row_items = [(idx, list(self.mixed_frame.xs(idx))) + for idx in self.mixed_frame.index] + recons = DataFrame.from_items(row_items, + columns=self.mixed_frame.columns, + orient='index') + assert_frame_equal(recons, self.mixed_frame) + self.assert_(isinstance(recons['foo'][0], tuple)) + + def test_constructor_mix_series_nonseries(self): + df = DataFrame({'A' : self.frame['A'], + 'B' : list(self.frame['B'])}, columns=['A', 'B']) + assert_frame_equal(df, self.frame.ix[:, ['A', 'B']]) + + self.assertRaises(Exception, DataFrame, + {'A' : self.frame['A'], + 'B' : list(self.frame['B'])[:-2]}) + + def test_constructor_miscast_na_int_dtype(self): + df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) + expected = DataFrame([[np.nan, 1], [1, 0]]) + assert_frame_equal(df, expected) + + def test_new_empty_index(self): + df1 = DataFrame(randn(0, 3)) + df2 = DataFrame(randn(0, 3)) + df1.index.name = 'foo' + self.assert_(df2.index.name is None) + + def test_astype(self): + casted = self.frame.astype(int) + expected = DataFrame(self.frame.values.astype(int), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(casted, expected) + + self.frame['foo'] = '5' + casted = self.frame.astype(int) + expected = DataFrame(self.frame.values.astype(int), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(casted, expected) + + def test_astype_cast_nan_int(self): + df = DataFrame(data={"Values": [1.0, 2.0, 3.0, np.nan]}) + self.assertRaises(ValueError, df.astype, np.int64) + + def test_array_interface(self): + result = np.sqrt(self.frame) + self.assert_(type(result) is type(self.frame)) + self.assert_(result.index is self.frame.index) + self.assert_(result.columns is self.frame.columns) + + assert_frame_equal(result, self.frame.apply(np.sqrt)) + + def test_pickle(self): + unpickled = pickle.loads(pickle.dumps(self.mixed_frame)) + assert_frame_equal(self.mixed_frame, unpickled) + + # buglet + self.mixed_frame._data.ndim + + # empty + unpickled = pickle.loads(pickle.dumps(self.empty)) + repr(unpickled) + + def test_to_dict(self): + test_data = { + 'A' : {'1' : 1, '2' : 2}, + 'B' : {'1' : '1', '2' : '2', '3' : '3'}, + } + recons_data = DataFrame(test_data).to_dict() + + for k, v in test_data.iteritems(): + for k2, v2 in v.iteritems(): + self.assertEqual(v2, recons_data[k][k2]) + + recons_data = DataFrame(test_data).to_dict("l") + + for k,v in test_data.iteritems(): + for k2, v2 in v.iteritems(): + self.assertEqual(v2, recons_data[k][int(k2) - 1]) + + recons_data = DataFrame(test_data).to_dict("s") + + for k,v in test_data.iteritems(): + for k2, v2 in v.iteritems(): + self.assertEqual(v2, recons_data[k][k2]) + + def test_from_json_to_json(self): + raise nose.SkipTest + + def _check_orient(df, orient, dtype=None, numpy=True): + df = df.sort() + dfjson = df.to_json(orient=orient) + unser = DataFrame.from_json(dfjson, orient=orient, dtype=dtype, + numpy=numpy) + unser = unser.sort() + if df.index.dtype.type == np.datetime64: + unser.index = DatetimeIndex(unser.index.values.astype('i8')) + if orient == "records": + # index is not captured in this orientation + assert_almost_equal(df.values, unser.values) 
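# A minimal illustrative sketch (not part of the imported source) of
# DataFrame.to_dict and the short orient codes checked in test_to_dict above;
# 'df' is a made-up example frame.
from pandas import DataFrame

df = DataFrame({'A': [1, 2], 'B': [3, 4]})
as_nested = df.to_dict()     # {column -> {index -> value}}
as_lists = df.to_dict('l')   # {column -> [values]}
as_series = df.to_dict('s')  # {column -> Series}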
+ self.assert_(df.columns.equals(unser.columns)) + elif orient == "values": + # index and cols are not captured in this orientation + assert_almost_equal(df.values, unser.values) + elif orient == "split": + # index and col labels might not be strings + unser.index = [str(i) for i in unser.index] + unser.columns = [str(i) for i in unser.columns] + unser = unser.sort() + assert_almost_equal(df.values, unser.values) + else: + assert_frame_equal(df, unser) + + def _check_all_orients(df, dtype=None): + _check_orient(df, "columns", dtype=dtype) + _check_orient(df, "records", dtype=dtype) + _check_orient(df, "split", dtype=dtype) + _check_orient(df, "index", dtype=dtype) + _check_orient(df, "values", dtype=dtype) + + _check_orient(df, "columns", dtype=dtype, numpy=False) + _check_orient(df, "records", dtype=dtype, numpy=False) + _check_orient(df, "split", dtype=dtype, numpy=False) + _check_orient(df, "index", dtype=dtype, numpy=False) + _check_orient(df, "values", dtype=dtype, numpy=False) + + # basic + _check_all_orients(self.frame) + self.assertEqual(self.frame.to_json(), + self.frame.to_json(orient="columns")) + + _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) + + # big one + # index and columns are strings as all unserialised JSON object keys + # are assumed to be strings + biggie = DataFrame(np.zeros((200, 4)), + columns=[str(i) for i in range(4)], + index=[str(i) for i in range(200)]) + _check_all_orients(biggie) + + # dtypes + _check_all_orients(DataFrame(biggie, dtype=np.float64), + dtype=np.float64) + _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int) + _check_all_orients(DataFrame(biggie, dtype=' other), df.gt(other)) + assert_frame_equal((df < other), df.lt(other)) + assert_frame_equal((df >= other), df.ge(other)) + assert_frame_equal((df <= other), df.le(other)) + + # Unaligned + def _check_unaligned_frame(meth, op, df, other, default=False): + part_o = other.ix[3:, 1:].copy() + rs = meth(df, part_o) + xp = op(df, part_o.reindex(index=df.index, columns=df.columns)) + assert_frame_equal(rs, xp) + + _check_unaligned_frame(DataFrame.eq, operator.eq, df, other) + _check_unaligned_frame(DataFrame.ne, operator.ne, df, other, + default=True) + _check_unaligned_frame(DataFrame.gt, operator.gt, df, other) + _check_unaligned_frame(DataFrame.lt, operator.lt, df, other) + _check_unaligned_frame(DataFrame.ge, operator.ge, df, other) + _check_unaligned_frame(DataFrame.le, operator.le, df, other) + + # Series + def _test_seq(df, idx_ser, col_ser): + idx_eq = df.eq(idx_ser, axis=0) + col_eq = df.eq(col_ser) + idx_ne = df.ne(idx_ser, axis=0) + col_ne = df.ne(col_ser) + assert_frame_equal(col_eq, df == Series(col_ser)) + assert_frame_equal(col_eq, -col_ne) + assert_frame_equal(idx_eq, -idx_ne) + assert_frame_equal(idx_eq, df.T.eq(idx_ser).T) + assert_frame_equal(col_eq, df.eq(list(col_ser))) + assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0)) + assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0)) + + idx_gt = df.gt(idx_ser, axis=0) + col_gt = df.gt(col_ser) + idx_le = df.le(idx_ser, axis=0) + col_le = df.le(col_ser) + + assert_frame_equal(col_gt, df > Series(col_ser)) + assert_frame_equal(col_gt, -col_le) + assert_frame_equal(idx_gt, -idx_le) + assert_frame_equal(idx_gt, df.T.gt(idx_ser).T) + + idx_ge = df.ge(idx_ser, axis=0) + col_ge = df.ge(col_ser) + idx_lt = df.lt(idx_ser, axis=0) + col_lt = df.lt(col_ser) + assert_frame_equal(col_ge, df >= Series(col_ser)) + assert_frame_equal(col_ge, -col_lt) + assert_frame_equal(idx_ge, -idx_lt) + 
assert_frame_equal(idx_ge, df.T.ge(idx_ser).T) + + idx_ser = Series(np.random.randn(5)) + col_ser = Series(np.random.randn(3)) + _test_seq(df, idx_ser, col_ser) + + # ndarray + + assert_frame_equal((df == other.values), df.eq(other.values)) + assert_frame_equal((df != other.values), df.ne(other.values)) + assert_frame_equal((df > other.values), df.gt(other.values)) + assert_frame_equal((df < other.values), df.lt(other.values)) + assert_frame_equal((df >= other.values), df.ge(other.values)) + assert_frame_equal((df <= other.values), df.le(other.values)) + + # list/tuple + _test_seq(df, idx_ser.values, col_ser.values) + + # NA + df.ix[0, 0] = np.nan + rs = df.eq(df) + self.assert_(not rs.ix[0, 0]) + rs = df.ne(df) + self.assert_(rs.ix[0, 0]) + rs = df.gt(df) + self.assert_(not rs.ix[0, 0]) + rs = df.lt(df) + self.assert_(not rs.ix[0, 0]) + rs = df.ge(df) + self.assert_(not rs.ix[0, 0]) + rs = df.le(df) + self.assert_(not rs.ix[0, 0]) + + # scalar + assert_frame_equal(df.eq(0), df == 0) + assert_frame_equal(df.ne(0), df != 0) + assert_frame_equal(df.gt(0), df > 0) + assert_frame_equal(df.lt(0), df < 0) + assert_frame_equal(df.ge(0), df >= 0) + assert_frame_equal(df.le(0), df <= 0) + + assert_frame_equal(df.eq(np.nan), df == np.nan) + assert_frame_equal(df.ne(np.nan), df != np.nan) + assert_frame_equal(df.gt(np.nan), df > np.nan) + assert_frame_equal(df.lt(np.nan), df < np.nan) + assert_frame_equal(df.ge(np.nan), df >= np.nan) + assert_frame_equal(df.le(np.nan), df <= np.nan) + + # complex + arr = np.array([np.nan, 1, 6, np.nan]) + arr2 = np.array([2j, np.nan, 7, None]) + df = DataFrame({'a' : arr}) + df2 = DataFrame({'a' : arr2}) + rs = df.gt(df2) + self.assert_(not rs.values.any()) + rs = df.ne(df2) + self.assert_(rs.values.all()) + + arr3 = np.array([2j, np.nan, None]) + df3 = DataFrame({'a' : arr3}) + rs = df3.gt(2j) + self.assert_(not rs.values.any()) + + # corner, dtype=object + df1 = DataFrame({'col' : ['foo', np.nan, 'bar']}) + df2 = DataFrame({'col' : ['foo', datetime.now(), 'bar']}) + result = df1.ne(df2) + exp = DataFrame({'col' : [False, True, False]}) + assert_frame_equal(result, exp) + + def test_arith_flex_series(self): + df = self.simple + + row = df.xs('a') + col = df['two'] + + assert_frame_equal(df.add(row), df + row) + assert_frame_equal(df.add(row, axis=None), df + row) + assert_frame_equal(df.sub(row), df - row) + assert_frame_equal(df.div(row), df / row) + assert_frame_equal(df.mul(row), df * row) + + assert_frame_equal(df.add(col, axis=0), (df.T + col).T) + assert_frame_equal(df.sub(col, axis=0), (df.T - col).T) + assert_frame_equal(df.div(col, axis=0), (df.T / col).T) + assert_frame_equal(df.mul(col, axis=0), (df.T * col).T) + + def test_arith_non_pandas_object(self): + df = self.simple + + val1 = df.xs('a').values + added = DataFrame(df.values + val1, index=df.index, columns=df.columns) + assert_frame_equal(df + val1, added) + + added = DataFrame((df.values.T + val1).T, + index=df.index, columns=df.columns) + assert_frame_equal(df.add(val1, axis=0), added) + + + val2 = list(df['two']) + + added = DataFrame(df.values + val2, index=df.index, columns=df.columns) + assert_frame_equal(df + val2, added) + + added = DataFrame((df.values.T + val2).T, index=df.index, + columns=df.columns) + assert_frame_equal(df.add(val2, axis='index'), added) + + val3 = np.random.rand(*df.shape) + added = DataFrame(df.values + val3, index=df.index, columns=df.columns) + assert_frame_equal(df.add(val3), added) + + def test_combineFrame(self): + frame_copy = 
self.frame.reindex(self.frame.index[::2]) + + del frame_copy['D'] + frame_copy['C'][:5] = nan + + added = self.frame + frame_copy + tm.assert_dict_equal(added['A'].valid(), + self.frame['A'] * 2, + compare_keys=False) + + self.assert_(np.isnan(added['C'].reindex(frame_copy.index)[:5]).all()) + + # assert(False) + + self.assert_(np.isnan(added['D']).all()) + + self_added = self.frame + self.frame + self.assert_(self_added.index.equals(self.frame.index)) + + added_rev = frame_copy + self.frame + self.assert_(np.isnan(added['D']).all()) + + # corner cases + + # empty + plus_empty = self.frame + self.empty + self.assert_(np.isnan(plus_empty.values).all()) + + empty_plus = self.empty + self.frame + self.assert_(np.isnan(empty_plus.values).all()) + + empty_empty = self.empty + self.empty + self.assertTrue(empty_empty.empty) + + # out of order + reverse = self.frame.reindex(columns=self.frame.columns[::-1]) + + assert_frame_equal(reverse + self.frame, self.frame * 2) + + def test_combineSeries(self): + + # Series + series = self.frame.xs(self.frame.index[0]) + + added = self.frame + series + + for key, s in added.iteritems(): + assert_series_equal(s, self.frame[key] + series[key]) + + larger_series = series.to_dict() + larger_series['E'] = 1 + larger_series = Series(larger_series) + larger_added = self.frame + larger_series + + for key, s in self.frame.iteritems(): + assert_series_equal(larger_added[key], s + series[key]) + self.assert_('E' in larger_added) + self.assert_(np.isnan(larger_added['E']).all()) + + # TimeSeries + ts = self.tsframe['A'] + added = self.tsframe + ts + + for key, col in self.tsframe.iteritems(): + assert_series_equal(added[key], col + ts) + + smaller_frame = self.tsframe[:-5] + smaller_added = smaller_frame + ts + + self.assert_(smaller_added.index.equals(self.tsframe.index)) + + smaller_ts = ts[:-5] + smaller_added2 = self.tsframe + smaller_ts + assert_frame_equal(smaller_added, smaller_added2) + + # length 0 + result = self.tsframe + ts[:0] + + # Frame is length 0 + result = self.tsframe[:0] + ts + self.assertEqual(len(result), 0) + + # empty but with non-empty index + frame = self.tsframe[:1].reindex(columns=[]) + result = frame * ts + self.assertEqual(len(result), len(ts)) + + def test_combineFunc(self): + result = self.frame * 2 + self.assert_(np.array_equal(result.values, self.frame.values * 2)) + + result = self.empty * 2 + self.assert_(result.index is self.empty.index) + self.assertEqual(len(result.columns), 0) + + def test_comparisons(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + + row = self.simple.xs('a') + + def test_comp(func): + result = func(df1, df2) + self.assert_(np.array_equal(result.values, + func(df1.values, df2.values))) + + result2 = func(self.simple, row) + self.assert_(np.array_equal(result2.values, + func(self.simple.values, row.values))) + + result3 = func(self.frame, 0) + self.assert_(np.array_equal(result3.values, + func(self.frame.values, 0))) + + self.assertRaises(Exception, func, self.simple, self.simple[:2]) + + test_comp(operator.eq) + test_comp(operator.ne) + test_comp(operator.lt) + test_comp(operator.gt) + test_comp(operator.ge) + test_comp(operator.le) + + def test_string_comparison(self): + df = DataFrame([{ "a" : 1, "b" : "foo" }, {"a" : 2, "b" : "bar"}]) + mask_a = df.a > 1 + assert_frame_equal(df[mask_a], df.ix[1:1,:]) + assert_frame_equal(df[-mask_a], df.ix[0:0,:]) + + mask_b = df.b == "foo" + assert_frame_equal(df[mask_b], df.ix[0:0,:]) + assert_frame_equal(df[-mask_b], df.ix[1:1,:]) + + def 
test_float_none_comparison(self): + df = DataFrame(np.random.randn(8, 3), index=range(8), + columns=['A', 'B', 'C']) + + self.assertRaises(TypeError, df.__eq__, None) + + def test_to_csv_from_csv(self): + path = '__tmp__' + + self.frame['A'][:5] = nan + + self.frame.to_csv(path) + self.frame.to_csv(path, cols=['A', 'B']) + self.frame.to_csv(path, header=False) + self.frame.to_csv(path, index=False) + + # test roundtrip + + self.tsframe.to_csv(path) + recons = DataFrame.from_csv(path) + + assert_frame_equal(self.tsframe, recons) + + self.tsframe.to_csv(path, index_label='index') + recons = DataFrame.from_csv(path, index_col=None) + assert(len(recons.columns) == len(self.tsframe.columns) + 1) + + # no index + self.tsframe.to_csv(path, index=False) + recons = DataFrame.from_csv(path, index_col=None) + assert_almost_equal(self.tsframe.values, recons.values) + + # corner case + dm = DataFrame({'s1' : Series(range(3),range(3)), + 's2' : Series(range(2),range(2))}) + dm.to_csv(path) + recons = DataFrame.from_csv(path) + assert_frame_equal(dm, recons) + + + + #duplicate index + df = DataFrame(np.random.randn(3, 3), index=['a', 'a', 'b'], + columns=['x', 'y', 'z']) + df.to_csv(path) + result = DataFrame.from_csv(path) + assert_frame_equal(result, df) + + midx = MultiIndex.from_tuples([('A', 1, 2), ('A', 1, 2), ('B', 1, 2)]) + df = DataFrame(np.random.randn(3, 3), index=midx, + columns=['x', 'y', 'z']) + df.to_csv(path) + result = DataFrame.from_csv(path, index_col=[0, 1, 2], + parse_dates=False) + assert_frame_equal(result, df) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_csv(path, header=col_aliases) + rs = DataFrame.from_csv(path) + xp = self.frame2.copy() + xp.columns = col_aliases + + assert_frame_equal(xp, rs) + + self.assertRaises(ValueError, self.frame2.to_csv, path, + header=['AA', 'X']) + + os.remove(path) + + def test_to_csv_multiindex(self): + path = '__tmp__' + + frame = self.frame + old_index = frame.index + arrays = np.arange(len(old_index)*2).reshape(2,-1) + new_index = MultiIndex.from_arrays(arrays, names=['first', 'second']) + frame.index = new_index + frame.to_csv(path, header=False) + frame.to_csv(path, cols=['A', 'B']) + + # round trip + frame.to_csv(path) + df = DataFrame.from_csv(path, index_col=[0,1], parse_dates=False) + + assert_frame_equal(frame, df) + self.assertEqual(frame.index.names, df.index.names) + self.frame.index = old_index # needed if setUP becomes a classmethod + + # try multiindex with dates + tsframe = self.tsframe + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_csv(path, index_label = ['time','foo']) + recons = DataFrame.from_csv(path, index_col=[0,1]) + assert_frame_equal(tsframe, recons) + + # do not load index + tsframe.to_csv(path) + recons = DataFrame.from_csv(path, index_col=None) + np.testing.assert_equal(len(recons.columns), len(tsframe.columns) + 2) + + # no index + tsframe.to_csv(path, index=False) + recons = DataFrame.from_csv(path, index_col=None) + assert_almost_equal(recons.values, self.tsframe.values) + self.tsframe.index = old_index # needed if setUP becomes classmethod + + os.remove(path) + + # empty + tsframe[:0].to_csv(path) + recons = DataFrame.from_csv(path) + exp = tsframe[:0] + exp.index = [] + assert_frame_equal(recons, exp) + + def test_to_csv_float32_nanrep(self): + df = DataFrame(np.random.randn(1, 4).astype(np.float32)) + df[1] = np.nan + + pth = '__tmp__.csv' + df.to_csv(pth, na_rep=999) + + 
lines = open(pth).readlines() + self.assert_(lines[1].split(',')[2] == '999') + os.remove(pth) + + def test_to_csv_withcommas(self): + + path = '__tmp__' + # Commas inside fields should be correctly escaped when saving as CSV. + + df = DataFrame({'A':[1,2,3], 'B':['5,6','7,8','9,0']}) + df.to_csv(path) + df2 = DataFrame.from_csv(path) + assert_frame_equal(df2, df) + + os.remove(path) + + def test_to_csv_bug(self): + path = '__tmp__.csv' + f1 = StringIO('a,1.0\nb,2.0') + df = DataFrame.from_csv(f1,header=None) + newdf = DataFrame({'t': df[df.columns[0]]}) + newdf.to_csv(path) + + recons = pan.read_csv(path, index_col=0) + assert_frame_equal(recons, newdf) + + os.remove(path) + + def test_to_csv_unicode(self): + path = '__tmp__.csv' + df = DataFrame({u'c/\u03c3':[1,2,3]}) + df.to_csv(path, encoding='UTF-8') + df2 = pan.read_csv(path, index_col=0, encoding='UTF-8') + assert_frame_equal(df, df2) + + df.to_csv(path, encoding='UTF-8', index=False) + df2 = pan.read_csv(path, index_col=None, encoding='UTF-8') + assert_frame_equal(df, df2) + + os.remove(path) + + def test_to_csv_stringio(self): + buf = StringIO() + self.frame.to_csv(buf) + buf.seek(0) + recons = pan.read_csv(buf, index_col=0) + assert_frame_equal(recons, self.frame) + + def test_to_excel_from_excel(self): + try: + import xlwt + import xlrd + import openpyxl + except ImportError: + raise nose.SkipTest + + for ext in ['xls', 'xlsx']: + path = '__tmp__.' + ext + + self.frame['A'][:5] = nan + + self.frame.to_excel(path,'test1') + self.frame.to_excel(path,'test1', cols=['A', 'B']) + self.frame.to_excel(path,'test1', header=False) + self.frame.to_excel(path,'test1', index=False) + + # test roundtrip + self.frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0) + assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path,'test1', index=False) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=None) + recons.index = self.frame.index + assert_frame_equal(self.frame, recons) + + self.frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0, skiprows=[1]) + assert_frame_equal(self.frame.ix[1:], recons) + + self.frame.to_excel(path,'test1',na_rep='NA') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0, na_values=['NA']) + assert_frame_equal(self.frame, recons) + + self.mixed_frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=0) + assert_frame_equal(self.mixed_frame, recons) + + self.tsframe.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + assert_frame_equal(self.tsframe, recons) + + #Test np.int64, values read come back as float + frame = DataFrame(np.random.randint(-10,10,size=(10,2))) + frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1').astype(np.int64) + assert_frame_equal(frame, recons) + + #Test reading/writing np.bool8, roundtrip only works for xlsx + frame = (DataFrame(np.random.randn(10,2)) >= 0) + frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1').astype(np.bool8) + assert_frame_equal(frame, recons) + + # Test writing to separate sheets + writer = ExcelWriter(path) + self.frame.to_excel(writer,'test1') + self.tsframe.to_excel(writer,'test2') + writer.save() + reader = ExcelFile(path) + recons = reader.parse('test1',index_col=0) + assert_frame_equal(self.frame, recons) + recons = reader.parse('test2',index_col=0) + 
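# A minimal illustrative sketch (not part of the imported source) of the
# to_csv / from_csv round trip relied on throughout the CSV tests above
# (0.8-era API; the frame and path are made-up examples).
import os
from pandas import DataFrame

df = DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
path = '__example__.csv'
df.to_csv(path)                    # the index is written by default
recons = DataFrame.from_csv(path)  # reads it back, first column as the index
os.remove(path)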
assert_frame_equal(self.tsframe, recons) + np.testing.assert_equal(2, len(reader.sheet_names)) + np.testing.assert_equal('test1', reader.sheet_names[0]) + np.testing.assert_equal('test2', reader.sheet_names[1]) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(path, 'test1', header=col_aliases) + reader = ExcelFile(path) + rs = reader.parse('test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + assert_frame_equal(xp, rs) + + os.remove(path) + + # datetime.date, not sure what to test here exactly + path = '__tmp__.xls' + tsf = self.tsframe.copy() + tsf.index = [x.date() for x in self.tsframe.index] + tsf.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + assert_frame_equal(self.tsframe, recons) + os.remove(path) + + #Test roundtrip np.bool8, does not seem to work for xls + path = '__tmp__.xlsx' + frame = (DataFrame(np.random.randn(10,2)) >= 0) + frame.to_excel(path,'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + assert_frame_equal(frame, recons) + os.remove(path) + + + def test_to_excel_multiindex(self): + try: + import xlwt + import xlrd + import openpyxl + except ImportError: + raise nose.SkipTest + + for ext in ['xls', 'xlsx']: + path = '__tmp__.' + ext + + frame = self.frame + old_index = frame.index + arrays = np.arange(len(old_index)*2).reshape(2,-1) + new_index = MultiIndex.from_arrays(arrays, + names=['first', 'second']) + frame.index = new_index + frame.to_excel(path, 'test1', header=False) + frame.to_excel(path, 'test1', cols=['A', 'B']) + + # round trip + frame.to_excel(path, 'test1') + reader = ExcelFile(path) + df = reader.parse('test1', index_col=[0,1], parse_dates=False) + assert_frame_equal(frame, df) + self.assertEqual(frame.index.names, df.index.names) + self.frame.index = old_index # needed if setUP becomes a classmethod + + # try multiindex with dates + tsframe = self.tsframe + old_index = tsframe.index + new_index = [old_index, np.arange(len(old_index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.to_excel(path, 'test1', index_label = ['time','foo']) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=[0,1]) + assert_frame_equal(tsframe, recons) + + # infer index + tsframe.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + assert_frame_equal(tsframe, recons) + + # no index + tsframe.index.names = ['first', 'second'] + tsframe.to_excel(path, 'test1') + reader = ExcelFile(path) + recons = reader.parse('test1') + assert_almost_equal(tsframe.values, + recons.ix[:, tsframe.columns].values) + self.assertEqual(len(tsframe.columns) + 2, len(recons.columns)) + + tsframe.index.names = [None, None] + + # no index + tsframe.to_excel(path, 'test1', index=False) + reader = ExcelFile(path) + recons = reader.parse('test1', index_col=None) + assert_almost_equal(recons.values, self.tsframe.values) + self.tsframe.index = old_index # needed if setUP becomes classmethod + + # write a big DataFrame + df = DataFrame(np.random.randn(1005, 1)) + df.to_excel(path, 'test1') + + os.remove(path) + + def test_info(self): + io = StringIO() + self.frame.info(buf=io) + self.tsframe.info(buf=io) + + frame = DataFrame(np.random.randn(5, 3)) + + import sys + sys.stdout = StringIO() + frame.info() + frame.info(verbose=False) + sys.stdout = sys.__stdout__ + + def test_dtypes(self): + self.mixed_frame['bool'] = self.mixed_frame['A'] > 0 + result = self.mixed_frame.dtypes + expected = Series(dict((k, v.dtype) + for k, v 
in self.mixed_frame.iteritems()), + index=result.index) + assert_series_equal(result, expected) + + def test_convert_objects(self): + oops = self.mixed_frame.T.T + converted = oops.convert_objects() + assert_frame_equal(converted, self.mixed_frame) + self.assert_(converted['A'].dtype == np.float64) + + def test_convert_objects_no_conversion(self): + mixed1 = DataFrame({'a': [1,2,3], 'b': [4.0, 5, 6], 'c': ['x','y','z']}) + mixed2 = mixed1.convert_objects() + assert_frame_equal(mixed1, mixed2) + + def test_append_series_dict(self): + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + + series = df.ix[4] + self.assertRaises(Exception, df.append, series, verify_integrity=True) + series.name = None + self.assertRaises(Exception, df.append, series, verify_integrity=True) + + result = df.append(series[::-1], ignore_index=True) + expected = df.append(DataFrame({0 : series[::-1]}, index=df.columns).T, + ignore_index=True) + assert_frame_equal(result, expected) + + # dict + result = df.append(series.to_dict(), ignore_index=True) + assert_frame_equal(result, expected) + + result = df.append(series[::-1][:3], ignore_index=True) + expected = df.append(DataFrame({0 : series[::-1][:3]}).T, + ignore_index=True) + assert_frame_equal(result, expected.ix[:, result.columns]) + + # can append when name set + row = df.ix[4] + row.name = 5 + result = df.append(row) + expected = df.append(df[-1:], ignore_index=True) + assert_frame_equal(result, expected) + + def test_append_list_of_series_dicts(self): + df = DataFrame(np.random.randn(5, 4), + columns=['foo', 'bar', 'baz', 'qux']) + + dicts = [x.to_dict() for idx, x in df.iterrows()] + + result = df.append(dicts, ignore_index=True) + expected = df.append(df, ignore_index=True) + assert_frame_equal(result, expected) + + # different columns + dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4}, + {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}] + result = df.append(dicts, ignore_index=True) + expected = df.append(DataFrame(dicts), ignore_index=True) + assert_frame_equal(result, expected) + + def test_asfreq(self): + offset_monthly = self.tsframe.asfreq(datetools.bmonthEnd) + rule_monthly = self.tsframe.asfreq('BM') + + assert_almost_equal(offset_monthly['A'], rule_monthly['A']) + + filled = rule_monthly.asfreq('B', method='pad') + # TODO: actually check that this worked. + + # don't forget! 
+ filled_dep = rule_monthly.asfreq('B', method='pad') + + # test does not blow up on length-0 DataFrame + zero_length = self.tsframe.reindex([]) + result = zero_length.asfreq('BM') + self.assert_(result is not zero_length) + + def test_asfreq_datetimeindex(self): + from pandas import DatetimeIndex + df = DataFrame({'A': [1,2,3]}, + index=[datetime(2011,11,01), datetime(2011,11,2), + datetime(2011,11,3)]) + df = df.asfreq('B') + self.assert_(isinstance(df.index, DatetimeIndex)) + + ts = df['A'].asfreq('B') + self.assert_(isinstance(ts.index, DatetimeIndex)) + + def test_as_matrix(self): + frame = self.frame + mat = frame.as_matrix() + + frameCols = frame.columns + for i, row in enumerate(mat): + for j, value in enumerate(row): + col = frameCols[j] + if np.isnan(value): + self.assert_(np.isnan(frame[col][i])) + else: + self.assertEqual(value, frame[col][i]) + + # mixed type + mat = self.mixed_frame.as_matrix(['foo', 'A']) + self.assertEqual(mat[0, 0], 'bar') + + df = DataFrame({'real' : [1,2,3], 'complex' : [1j, 2j, 3j]}) + mat = df.as_matrix() + self.assertEqual(mat[0, 0], 1j) + + # single block corner case + mat = self.frame.as_matrix(['A', 'B']) + expected = self.frame.reindex(columns=['A', 'B']).values + assert_almost_equal(mat, expected) + + def test_values(self): + self.frame.values[:, 0] = 5. + self.assert_((self.frame.values[:, 0] == 5).all()) + + def test_deepcopy(self): + cp = deepcopy(self.frame) + series = cp['A'] + series[:] = 10 + for idx, value in series.iteritems(): + self.assertNotEqual(self.frame['A'][idx], value) + + def test_copy(self): + cop = self.frame.copy() + cop['E'] = cop['A'] + self.assert_('E' not in self.frame) + + # copy objects + copy = self.mixed_frame.copy() + self.assert_(copy._data is not self.mixed_frame._data) + + # def test_copy_index_name_checking(self): + # # don't want to be able to modify the index stored elsewhere after + # # making a copy + + # self.frame.columns.name = None + # cp = self.frame.copy() + # cp.columns.name = 'foo' + + # self.assert_(self.frame.columns.name is None) + + def test_corr(self): + _skip_if_no_scipy() + self.frame['A'][:5] = nan + self.frame['B'][:10] = nan + + def _check_method(method='pearson'): + correls = self.frame.corr(method=method) + exp = self.frame['A'].corr(self.frame['C'], method=method) + assert_almost_equal(correls['A']['C'], exp) + + _check_method('pearson') + _check_method('kendall') + _check_method('spearman') + + # exclude non-numeric types + result = self.mixed_frame.corr() + expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].corr() + assert_frame_equal(result, expected) + + def test_cov(self): + self.frame['A'][:5] = nan + self.frame['B'][:10] = nan + cov = self.frame.cov() + + assert_almost_equal(cov['A']['C'], + self.frame['A'].cov(self.frame['C'])) + + # exclude non-numeric types + result = self.mixed_frame.cov() + expected = self.mixed_frame.ix[:, ['A', 'B', 'C', 'D']].cov() + assert_frame_equal(result, expected) + + def test_corrwith(self): + a = self.tsframe + noise = Series(randn(len(a)), index=a.index) + + b = self.tsframe + noise + + # make sure order does not matter + b = b.reindex(columns=b.columns[::-1], index=b.index[::-1][10:]) + del b['B'] + + colcorr = a.corrwith(b, axis=0) + assert_almost_equal(colcorr['A'], a['A'].corr(b['A'])) + + rowcorr = a.corrwith(b, axis=1) + assert_series_equal(rowcorr, a.T.corrwith(b.T, axis=0)) + + dropped = a.corrwith(b, axis=0, drop=True) + assert_almost_equal(dropped['A'], a['A'].corr(b['A'])) + self.assert_('B' not in dropped) + + dropped = 
a.corrwith(b, axis=1, drop=True) + self.assert_(a.index[-1] not in dropped.index) + + # non time-series data + index = ['a', 'b', 'c', 'd', 'e'] + columns = ['one', 'two', 'three', 'four'] + df1 = DataFrame(randn(5, 4), index=index, columns=columns) + df2 = DataFrame(randn(4, 4), index=index[:4], columns=columns) + correls = df1.corrwith(df2, axis=1) + for row in index[:4]: + assert_almost_equal(correls[row], df1.ix[row].corr(df2.ix[row])) + + def test_corrwith_with_objects(self): + df1 = tm.makeTimeDataFrame() + df2 = tm.makeTimeDataFrame() + cols = ['A', 'B', 'C', 'D'] + + df1['obj'] = 'foo' + df2['obj'] = 'bar' + + result = df1.corrwith(df2) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols]) + assert_series_equal(result, expected) + + result = df1.corrwith(df2, axis=1) + expected = df1.ix[:, cols].corrwith(df2.ix[:, cols], axis=1) + assert_series_equal(result, expected) + + def test_corrwith_series(self): + result = self.tsframe.corrwith(self.tsframe['A']) + expected = self.tsframe.apply(self.tsframe['A'].corr) + + assert_series_equal(result, expected) + + def test_dropEmptyRows(self): + N = len(self.frame.index) + mat = randn(N) + mat[:5] = nan + + frame = DataFrame({'foo' : mat}, index=self.frame.index) + + smaller_frame = frame.dropna(how='all') + self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + + smaller_frame = frame.dropna(how='all', subset=['foo']) + self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + + def test_dropIncompleteRows(self): + N = len(self.frame.index) + mat = randn(N) + mat[:5] = nan + + frame = DataFrame({'foo' : mat}, index=self.frame.index) + frame['bar'] = 5 + + smaller_frame = frame.dropna() + self.assert_(np.array_equal(smaller_frame['foo'], mat[5:])) + + samesize_frame = frame.dropna(subset=['bar']) + self.assert_(samesize_frame.index.equals(self.frame.index)) + + def test_dropna(self): + df = DataFrame(np.random.randn(6, 4)) + df[2][:2] = nan + + dropped = df.dropna(axis=1) + expected = df.ix[:, [0, 1, 3]] + assert_frame_equal(dropped, expected) + + dropped = df.dropna(axis=0) + expected = df.ix[range(2, 6)] + assert_frame_equal(dropped, expected) + + # threshold + dropped = df.dropna(axis=1, thresh=5) + expected = df.ix[:, [0, 1, 3]] + assert_frame_equal(dropped, expected) + + dropped = df.dropna(axis=0, thresh=4) + expected = df.ix[range(2, 6)] + assert_frame_equal(dropped, expected) + + dropped = df.dropna(axis=1, thresh=4) + assert_frame_equal(dropped, df) + + dropped = df.dropna(axis=1, thresh=3) + assert_frame_equal(dropped, df) + + # subset + dropped = df.dropna(axis=0, subset=[0, 1, 3]) + assert_frame_equal(dropped, df) + + # all + dropped = df.dropna(axis=1, how='all') + assert_frame_equal(dropped, df) + + df[2] = nan + dropped = df.dropna(axis=1, how='all') + expected = df.ix[:, [0, 1, 3]] + assert_frame_equal(dropped, expected) + + def test_dropna_corner(self): + # bad input + self.assertRaises(ValueError, self.frame.dropna, how='foo') + self.assertRaises(ValueError, self.frame.dropna, how=None) + + def test_drop_duplicates(self): + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1, 1, 2, 2, 2, 2, 1, 2], + 'D' : range(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', take_last=True) + expected = df.ix[[6, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['A', 'B']) 
+ expected = df.ix[[0, 1, 2, 3]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], take_last=True) + expected = df.ix[[0, 5, 6, 7]] + assert_frame_equal(result, expected) + + # consider everything + df2 = df.ix[:, ['A', 'B', 'C']] + + result = df2.drop_duplicates() + # in this case only + expected = df2.drop_duplicates(['A', 'B']) + assert_frame_equal(result, expected) + + result = df2.drop_duplicates(take_last=True) + expected = df2.drop_duplicates(['A', 'B'], take_last=True) + assert_frame_equal(result, expected) + + def test_drop_duplicates_NA(self): + # none + df = DataFrame({'A' : [None, None, 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D' : range(8)}) + + # single column + result = df.drop_duplicates('A') + expected = df.ix[[0, 2, 3]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('A', take_last=True) + expected = df.ix[[1, 6, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['A', 'B']) + expected = df.ix[[0, 2, 3, 6]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['A', 'B'], take_last=True) + expected = df.ix[[1, 5, 6, 7]] + assert_frame_equal(result, expected) + + # nan + df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1.0, np.nan, np.nan, np.nan, 1., 1., 1, 1.], + 'D' : range(8)}) + + # single column + result = df.drop_duplicates('C') + expected = df[:2] + assert_frame_equal(result, expected) + + result = df.drop_duplicates('C', take_last=True) + expected = df.ix[[3, 7]] + assert_frame_equal(result, expected) + + # multi column + result = df.drop_duplicates(['C', 'B']) + expected = df.ix[[0, 1, 2, 4]] + assert_frame_equal(result, expected) + + result = df.drop_duplicates(['C', 'B'], take_last=True) + expected = df.ix[[1, 3, 6, 7]] + assert_frame_equal(result, expected) + + def test_drop_duplicates_inplace(self): + orig = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'bar', 'foo'], + 'B' : ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C' : [1, 1, 2, 2, 2, 2, 1, 2], + 'D' : range(8)}) + + # single column + df = orig.copy() + df.drop_duplicates('A', inplace=True) + expected = orig[:2] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates('A', take_last=True, inplace=True) + expected = orig.ix[[6, 7]] + result = df + assert_frame_equal(result, expected) + + # multi column + df = orig.copy() + df.drop_duplicates(['A', 'B'], inplace=True) + expected = orig.ix[[0, 1, 2, 3]] + result = df + assert_frame_equal(result, expected) + + df = orig.copy() + df.drop_duplicates(['A', 'B'], take_last=True, inplace=True) + expected = orig.ix[[0, 5, 6, 7]] + result = df + assert_frame_equal(result, expected) + + # consider everything + orig2 = orig.ix[:, ['A', 'B', 'C']].copy() + + df2 = orig2.copy() + df2.drop_duplicates(inplace=True) + # in this case only + expected = orig2.drop_duplicates(['A', 'B']) + result = df2 + assert_frame_equal(result, expected) + + df2 = orig2.copy() + df2.drop_duplicates(take_last=True, inplace=True) + expected = orig2.drop_duplicates(['A', 'B'], take_last=True) + result = df2 + assert_frame_equal(result, expected) + + def test_drop_col_still_multiindex(self): + arrays = [[ 'a', 'b', 'c', 'top'], + [ '', '', '', 'OD' ], + [ '', '', '', 'wx' ]] 
+ + tuples = zip(*arrays) + tuples.sort() + index = MultiIndex.from_tuples(tuples) + + df = DataFrame(randn(3,4), columns=index) + del df[('a','','')] + assert(isinstance(df.columns, MultiIndex)) + + def test_fillna(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + zero_filled = self.tsframe.fillna(0) + self.assert_((zero_filled['A'][:5] == 0).all()) + + padded = self.tsframe.fillna(method='pad') + self.assert_(np.isnan(padded['A'][:5]).all()) + self.assert_((padded['A'][-5:] == padded['A'][-5]).all()) + + # mixed type + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + result = self.mixed_frame.fillna(value=0) + + def test_fillna_skip_certain_blocks(self): + # don't try to fill boolean, int blocks + + df = DataFrame(np.random.randn(10, 4).astype(int)) + + # it works! + df.fillna(np.nan) + + def test_fillna_inplace(self): + df = DataFrame(np.random.randn(10, 4)) + df[1][:4] = np.nan + df[3][-4:] = np.nan + + expected = df.fillna(value=0) + self.assert_(expected is not df) + + df2 = df.fillna(value=0, inplace=True) + self.assert_(df2 is df) + assert_frame_equal(df2, expected) + + df[1][:4] = np.nan + df[3][-4:] = np.nan + expected = df.fillna() + self.assert_(expected is not df) + + df2 = df.fillna(inplace=True) + self.assert_(df2 is df) + assert_frame_equal(df2, expected) + + def test_fillna_dict_series(self): + df = DataFrame({'a': [nan, 1, 2, nan, nan], + 'b': [1, 2, 3, nan, nan], + 'c': [nan, 1, 2, 3, 4]}) + + result = df.fillna({'a': 0, 'b': 5}) + + expected = df.copy() + expected['a'] = expected['a'].fillna(0) + expected['b'] = expected['b'].fillna(5) + assert_frame_equal(result, expected) + + # it works + result = df.fillna({'a': 0, 'b': 5, 'd' : 7}) + + # Series treated same as dict + result = df.fillna(df.max()) + expected = df.fillna(df.max().to_dict()) + assert_frame_equal(result, expected) + + # disable this for now + self.assertRaises(Exception, df.fillna, df.max(1), axis=1) + + def test_fillna_columns(self): + df = DataFrame(np.random.randn(10, 10)) + df.values[:, ::2] = np.nan + + result = df.fillna(axis=1) + expected = df.T.fillna(method='pad').T + assert_frame_equal(result, expected) + + df.insert(6, 'foo', 5) + result = df.fillna(axis=1) + expected = df.astype(float).fillna(axis=1) + assert_frame_equal(result, expected) + + def test_fillna_invalid_method(self): + try: + self.frame.fillna(method='ffil') + except ValueError, inst: + self.assert_('ffil' in str(inst)) + + def test_replace_inplace(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + tsframe = self.tsframe.copy() + tsframe.replace(nan, 0, inplace=True) + assert_frame_equal(tsframe, self.tsframe.fillna(0)) + + tsframe = self.tsframe.copy() + tsframe.replace(nan, method='pad', inplace=True) + assert_frame_equal(tsframe, self.tsframe.fillna(method='pad')) + + # mixed type + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + result = self.mixed_frame.replace(np.nan, 0) + expected = self.mixed_frame.fillna(value=0) + assert_frame_equal(result, expected) + + tsframe = self.tsframe.copy() + tsframe.replace([nan], [0], inplace=True) + assert_frame_equal(tsframe, self.tsframe.fillna(0)) + + def test_replace(self): + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + + zero_filled = self.tsframe.replace(nan, -1e8) + assert_frame_equal(zero_filled, self.tsframe.fillna(-1e8)) + assert_frame_equal(zero_filled.replace(-1e8, nan), self.tsframe) + + self.tsframe['A'][:5] = nan + self.tsframe['A'][-5:] = nan + 
self.tsframe['B'][:5] = -1e8 + + # empty + df = DataFrame(index=['a', 'b']) + assert_frame_equal(df, df.replace(5, 7)) + + def test_replace_mixed(self): + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + result = self.mixed_frame.replace(np.nan, -1e8) + expected = self.mixed_frame.fillna(value=-1e8) + assert_frame_equal(result, expected) + assert_frame_equal(result.replace(-1e8, nan), self.mixed_frame) + + def test_replace_interpolate(self): + padded = self.tsframe.replace(nan, method='pad') + assert_frame_equal(padded, self.tsframe.fillna(method='pad')) + + result = self.tsframe.replace(to_replace={'A' : nan}, method='pad', + axis=1) + expected = self.tsframe.T.replace(to_replace={'A' : nan}, method='pad').T + assert_frame_equal(result, expected) + + result = self.tsframe.replace(to_replace={'A' : nan, 'B' : -1e8}, + method='bfill') + tsframe = self.tsframe.copy() + b = tsframe['B'] + b[b == -1e8] = nan + tsframe['B'] = b + expected = tsframe.fillna(method='bfill') + assert_frame_equal(expected, result) + + bfilled = self.tsframe.replace(nan, method='bfill') + assert_frame_equal(bfilled, self.tsframe.fillna(method='bfill')) + + frame = self.tsframe.copy() + frame[frame == 0] = 1 + frame.ix[-5:, 2] = 0 + result = frame.replace([nan, 0], method='pad') + + expected = frame.copy() + expected[expected == 0] = nan + expected = expected.fillna(method='pad') + assert_frame_equal(result, expected) + + result = self.mixed_frame.replace(nan, method='pad', axis=1) + expected = self.mixed_frame.fillna(method='pad', axis=1) + assert_frame_equal(result, expected) + + # no nans + self.tsframe['A'][:5] = 1e8 + result = self.tsframe.replace(1e8, method='bfill') + self.tsframe['A'].replace(1e8, nan, inplace=True) + expected = self.tsframe.fillna(method='bfill') + assert_frame_equal(result, expected) + + def test_replace_dtypes(self): + # int + df = DataFrame({'ints' : [1,2,3]}) + result = df.replace(1, 0) + expected = DataFrame({'ints' : [0,2,3]}) + assert_frame_equal(result, expected) + + # bools + df = DataFrame({'bools': [True, False, True]}) + result = df.replace(False, True) + self.assert_(result.values.all()) + + #complex blocks + df = DataFrame({'complex': [1j, 2j, 3j]}) + result = df.replace(1j, 0j) + expected = DataFrame({'complex': [0j, 2j, 3j]}) + assert_frame_equal(result, expected) + + # datetime blocks + prev = datetime.today() + now = datetime.today() + df = DataFrame({'datetime64' : Index([prev, now, prev])}) + result = df.replace(prev, now) + expected = DataFrame({'datetime64' : Index([now] * 3)}) + assert_frame_equal(result, expected) + + def test_replace_input_formats(self): + # both dicts + to_rep = {'A' : np.nan, 'B' : 0, 'C' : ''} + values = {'A' : 0, 'B' : -1, 'C' : 'missing'} + df = DataFrame({'A' : [np.nan, 0, np.inf], 'B' : [0, 2, 5], + 'C' : ['', 'asdf', 'fd']}) + filled = df.replace(to_rep, values) + expected = {} + for k, v in df.iteritems(): + expected[k] = v.replace(to_rep[k], values[k]) + assert_frame_equal(filled, DataFrame(expected)) + + result = df.replace([0, 2, 5], [5, 2, 0]) + expected = DataFrame({'A' : [np.nan, 5, np.inf], 'B' : [5, 2, 0], + 'C' : ['', 'asdf', 'fd']}) + assert_frame_equal(result, expected) + + # dict to scalar + filled = df.replace(to_rep, 0) + expected = {} + for k, v in df.iteritems(): + expected[k] = v.replace(to_rep[k], 0) + assert_frame_equal(filled, DataFrame(expected)) + + self.assertRaises(ValueError, df.replace, to_rep, [np.nan, 0, '']) + + # scalar to dict + values = {'A' : 0, 'B' : -1, 'C' : 'missing'} + df = 
DataFrame({'A' : [np.nan, 0, np.nan], 'B' : [0, 2, 5],
+                        'C' : ['', 'asdf', 'fd']})
+        filled = df.replace(np.nan, values)
+        expected = {}
+        for k, v in df.iteritems():
+            expected[k] = v.replace(np.nan, values[k])
+        assert_frame_equal(filled, DataFrame(expected))
+
+        # list to list
+        to_rep = [np.nan, 0, '']
+        values = [-2, -1, 'missing']
+        result = df.replace(to_rep, values)
+        expected = df.copy()
+        for i in range(len(to_rep)):
+            expected.replace(to_rep[i], values[i], inplace=True)
+        assert_frame_equal(result, expected)
+
+        self.assertRaises(ValueError, df.replace, to_rep, values[1:])
+
+        # list to scalar
+        to_rep = [np.nan, 0, '']
+        result = df.replace(to_rep, -1)
+        expected = df.copy()
+        for i in range(len(to_rep)):
+            expected.replace(to_rep[i], -1, inplace=True)
+        assert_frame_equal(result, expected)
+
+    def test_replace_axis(self):
+        self.tsframe['A'][:5] = nan
+        self.tsframe['A'][-5:] = nan
+
+        zero_filled = self.tsframe.replace(nan, 0, axis=1)
+        assert_frame_equal(zero_filled, self.tsframe.fillna(0, axis=1))
+
+        padded = self.tsframe.replace(nan, method='pad', axis=1)
+        assert_frame_equal(padded, self.tsframe.fillna(method='pad', axis=1))
+
+        # mixed type
+        self.mixed_frame['foo'][5:20] = nan
+        self.mixed_frame['A'][-10:] = nan
+
+        result = self.mixed_frame.replace(np.nan, -1e8, axis=1)
+        expected = self.mixed_frame.fillna(value=-1e8, axis=1)
+        assert_frame_equal(result, expected)
+
+    def test_replace_limit(self):
+        padded = self.tsframe.replace(nan, method='pad', limit=2)
+        assert_frame_equal(padded, self.tsframe.fillna(method='pad',
+                                                       limit=2))
+
+        bfilled = self.tsframe.replace(nan, method='bfill', limit=2)
+        assert_frame_equal(bfilled, self.tsframe.fillna(method='bfill',
+                                                        limit=2))
+
+        padded = self.tsframe.replace(nan, method='pad', axis=1, limit=2)
+        assert_frame_equal(padded, self.tsframe.fillna(method='pad',
+                                                       axis=1, limit=2))
+
+        bfill = self.tsframe.replace(nan, method='bfill', axis=1, limit=2)
+        assert_frame_equal(bfill, self.tsframe.fillna(method='bfill',
+                                                      axis=1, limit=2))
+
+    def test_truncate(self):
+        offset = datetools.bday
+
+        ts = self.tsframe[::3]
+
+        start, end = self.tsframe.index[3], self.tsframe.index[6]
+
+        start_missing = self.tsframe.index[2]
+        end_missing = self.tsframe.index[7]
+
+        # neither specified
+        truncated = ts.truncate()
+        assert_frame_equal(truncated, ts)
+
+        # both specified
+        expected = ts[1:3]
+
+        truncated = ts.truncate(start, end)
+        assert_frame_equal(truncated, expected)
+
+        truncated = ts.truncate(start_missing, end_missing)
+        assert_frame_equal(truncated, expected)
+
+        # start specified
+        expected = ts[1:]
+
+        truncated = ts.truncate(before=start)
+        assert_frame_equal(truncated, expected)
+
+        truncated = ts.truncate(before=start_missing)
+        assert_frame_equal(truncated, expected)
+
+        # end specified
+        expected = ts[:3]
+
+        truncated = ts.truncate(after=end)
+        assert_frame_equal(truncated, expected)
+
+        truncated = ts.truncate(after=end_missing)
+        assert_frame_equal(truncated, expected)
+
+    def test_truncate_copy(self):
+        index = self.tsframe.index
+        truncated = self.tsframe.truncate(index[5], index[10])
+        truncated.values[:] = 5.
+ self.assert_(not (self.tsframe.values[5:11] == 5).any()) + + def test_xs(self): + idx = self.frame.index[5] + xs = self.frame.xs(idx) + for item, value in xs.iteritems(): + if np.isnan(value): + self.assert_(np.isnan(self.frame[item][idx])) + else: + self.assertEqual(value, self.frame[item][idx]) + + # mixed-type xs + test_data = { + 'A' : {'1' : 1, '2' : 2}, + 'B' : {'1' : '1', '2' : '2', '3' : '3'}, + } + frame = DataFrame(test_data) + xs = frame.xs('1') + self.assert_(xs.dtype == np.object_) + self.assertEqual(xs['A'], 1) + self.assertEqual(xs['B'], '1') + + self.assertRaises(Exception, self.tsframe.xs, + self.tsframe.index[0] - datetools.bday) + + # xs get column + series = self.frame.xs('A', axis=1) + expected = self.frame['A'] + assert_series_equal(series, expected) + + # no view by default + series[:] = 5 + self.assert_((expected != 5).all()) + + # view + series = self.frame.xs('A', axis=1, copy=False) + series[:] = 5 + self.assert_((expected == 5).all()) + + def test_xs_corner(self): + # pathological mixed-type reordering case + df = DataFrame(index=[0]) + df['A'] = 1. + df['B'] = 'foo' + df['C'] = 2. + df['D'] = 'bar' + df['E'] = 3. + + xs = df.xs(0) + assert_almost_equal(xs, [1., 'foo', 2., 'bar', 3.]) + + # no columns but index + df = DataFrame(index=['a', 'b', 'c']) + result = df.xs('a') + expected = Series([]) + assert_series_equal(result, expected) + + def test_pivot(self): + data = { + 'index' : ['A', 'B', 'C', 'C', 'B', 'A'], + 'columns' : ['One', 'One', 'One', 'Two', 'Two', 'Two'], + 'values' : [1., 2., 3., 3., 2., 1.] + } + + frame = DataFrame(data) + pivoted = frame.pivot(index='index', columns='columns', values='values') + + expected = DataFrame({ + 'One' : {'A' : 1., 'B' : 2., 'C' : 3.}, + 'Two' : {'A' : 1., 'B' : 2., 'C' : 3.} + }) + + assert_frame_equal(pivoted, expected) + + # name tracking + self.assertEqual(pivoted.index.name, 'index') + self.assertEqual(pivoted.columns.name, 'columns') + + # don't specify values + pivoted = frame.pivot(index='index', columns='columns') + self.assertEqual(pivoted.index.name, 'index') + self.assertEqual(pivoted.columns.names, [None, 'columns']) + + # pivot multiple columns + wp = tm.makePanel() + lp = wp.to_frame() + df = lp.reset_index() + assert_frame_equal(df.pivot('major', 'minor'), lp.unstack()) + + def test_pivot_duplicates(self): + data = DataFrame({'a' : ['bar', 'bar', 'foo', 'foo', 'foo'], + 'b' : ['one', 'two', 'one', 'one', 'two'], + 'c' : [1., 2., 3., 3., 4.]}) + self.assertRaises(Exception, data.pivot, 'a', 'b', 'c') + + def test_pivot_empty(self): + df = DataFrame({}, columns=['a', 'b', 'c']) + result = df.pivot('a', 'b', 'c') + expected = DataFrame({}) + assert_frame_equal(result, expected) + + def test_reindex(self): + newFrame = self.frame.reindex(self.ts1.index) + + for col in newFrame.columns: + for idx, val in newFrame[col].iteritems(): + if idx in self.frame.index: + if np.isnan(val): + self.assert_(np.isnan(self.frame[col][idx])) + else: + self.assertEqual(val, self.frame[col][idx]) + else: + self.assert_(np.isnan(val)) + + for col, series in newFrame.iteritems(): + self.assert_(tm.equalContents(series.index, newFrame.index)) + emptyFrame = self.frame.reindex(Index([])) + self.assert_(len(emptyFrame.index) == 0) + + # Cython code should be unit-tested directly + nonContigFrame = self.frame.reindex(self.ts1.index[::2]) + + for col in nonContigFrame.columns: + for idx, val in nonContigFrame[col].iteritems(): + if idx in self.frame.index: + if np.isnan(val): + self.assert_(np.isnan(self.frame[col][idx])) + 
else: + self.assertEqual(val, self.frame[col][idx]) + else: + self.assert_(np.isnan(val)) + + for col, series in nonContigFrame.iteritems(): + self.assert_(tm.equalContents(series.index, + nonContigFrame.index)) + + # corner cases + + # Same index, copies values + newFrame = self.frame.reindex(self.frame.index) + self.assert_(newFrame.index is self.frame.index) + + # length zero + newFrame = self.frame.reindex([]) + self.assert_(newFrame.empty) + self.assertEqual(len(newFrame.columns), len(self.frame.columns)) + + # length zero with columns reindexed with non-empty index + newFrame = self.frame.reindex([]) + newFrame = newFrame.reindex(self.frame.index) + self.assertEqual(len(newFrame.index), len(self.frame.index)) + self.assertEqual(len(newFrame.columns), len(self.frame.columns)) + + # pass non-Index + newFrame = self.frame.reindex(list(self.ts1.index)) + self.assert_(newFrame.index.equals(self.ts1.index)) + + def test_reindex_name_remains(self): + s = Series(random.rand(10)) + df = DataFrame(s, index=np.arange(len(s))) + i = Series(np.arange(10), name='iname') + df = df.reindex(i) + self.assert_(df.index.name == 'iname') + + df = df.reindex(Index(np.arange(10), name='tmpname')) + self.assert_(df.index.name == 'tmpname') + + s = Series(random.rand(10)) + df = DataFrame(s.T, index=np.arange(len(s))) + i = Series(np.arange(10), name='iname') + df = df.reindex(columns=i) + self.assert_(df.columns.name == 'iname') + + def test_reindex_int(self): + smaller = self.intframe.reindex(self.intframe.index[::2]) + + self.assert_(smaller['A'].dtype == np.int64) + + bigger = smaller.reindex(self.intframe.index) + self.assert_(bigger['A'].dtype == np.float64) + + smaller = self.intframe.reindex(columns=['A', 'B']) + self.assert_(smaller['A'].dtype == np.int64) + + def test_reindex_like(self): + other = self.frame.reindex(index=self.frame.index[:10], + columns=['C', 'B']) + + assert_frame_equal(other, self.frame.reindex_like(other)) + + def test_reindex_columns(self): + newFrame = self.frame.reindex(columns=['A', 'B', 'E']) + + assert_series_equal(newFrame['B'], self.frame['B']) + self.assert_(np.isnan(newFrame['E']).all()) + self.assert_('C' not in newFrame) + + # length zero + newFrame = self.frame.reindex(columns=[]) + self.assert_(newFrame.empty) + + def test_reindex_fill_value(self): + df = DataFrame(np.random.randn(10, 4)) + + # axis=0 + result = df.reindex(range(15)) + self.assert_(np.isnan(result.values[-5:]).all()) + + result = df.reindex(range(15), fill_value=0) + expected = df.reindex(range(15)).fillna(0) + assert_frame_equal(result, expected) + + # axis=1 + result = df.reindex(columns=range(5), fill_value=0.) + expected = df.copy() + expected[4] = 0. 
+ assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value=0) + expected = df.copy() + expected[4] = 0 + assert_frame_equal(result, expected) + + result = df.reindex(columns=range(5), fill_value='foo') + expected = df.copy() + expected[4] = 'foo' + assert_frame_equal(result, expected) + + # reindex_axis + result = df.reindex_axis(range(15), fill_value=0., axis=0) + expected = df.reindex(range(15)).fillna(0) + assert_frame_equal(result, expected) + + result = df.reindex_axis(range(5), fill_value=0., axis=1) + expected = df.reindex(columns=range(5)).fillna(0) + assert_frame_equal(result, expected) + + # other dtypes + df['foo'] = 'foo' + result = df.reindex(range(15), fill_value=0) + expected = df.reindex(range(15)).fillna(0) + assert_frame_equal(result, expected) + + def test_align(self): + af, bf = self.frame.align(self.frame) + self.assert_(af._data is not self.frame._data) + + af, bf = self.frame.align(self.frame, copy=False) + self.assert_(af._data is self.frame._data) + + # axis = 0 + other = self.frame.ix[:-5, :3] + af, bf = self.frame.align(other, axis=0, fill_value=-1) + self.assert_(bf.columns.equals(other.columns)) + #test fill value + join_idx = self.frame.index.join(other.index) + diff_a = self.frame.index.diff(join_idx) + diff_b = other.index.diff(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + self.assert_((diff_a_vals == -1).all()) + + af, bf = self.frame.align(other, join='right', axis=0) + self.assert_(bf.columns.equals(other.columns)) + self.assert_(bf.index.equals(other.index)) + self.assert_(af.index.equals(other.index)) + + # axis = 1 + other = self.frame.ix[:-5, :3].copy() + af, bf = self.frame.align(other, axis=1) + self.assert_(bf.columns.equals(self.frame.columns)) + self.assert_(bf.index.equals(other.index)) + + #test fill value + join_idx = self.frame.index.join(other.index) + diff_a = self.frame.index.diff(join_idx) + diff_b = other.index.diff(join_idx) + diff_a_vals = af.reindex(diff_a).values + diff_b_vals = bf.reindex(diff_b).values + self.assert_((diff_a_vals == -1).all()) + + af, bf = self.frame.align(other, join='inner', axis=1) + self.assert_(bf.columns.equals(other.columns)) + + af, bf = self.frame.align(other, join='inner', axis=1, method='pad') + self.assert_(bf.columns.equals(other.columns)) + + # test other non-float types + af, bf = self.intframe.align(other, join='inner', axis=1, method='pad') + self.assert_(bf.columns.equals(other.columns)) + + af, bf = self.mixed_frame.align(self.mixed_frame, + join='inner', axis=1, method='pad') + self.assert_(bf.columns.equals(self.mixed_frame.columns)) + + af, bf = self.frame.align(other.ix[:,0], join='inner', axis=1, + method=None, fill_value=None) + self.assert_(bf.index.equals(Index([]))) + + af, bf = self.frame.align(other.ix[:,0], join='inner', axis=1, + method=None, fill_value=0) + self.assert_(bf.index.equals(Index([]))) + + # try to align dataframe to series along bad axis + self.assertRaises(ValueError, self.frame.align, af.ix[0,:3], + join='inner', axis=2) + + def test_align_fill_method(self): + def _check_align(a, b, axis, fill_axis, how, method, limit=None): + aa, ab = a.align(b, axis=axis, join=how, method=method, limit=limit, + fill_axis=fill_axis) + + join_index, join_columns = None, None + + ea, eb = a, b + if axis is None or axis == 0: + join_index = a.index.join(b.index, how=how) + ea = ea.reindex(index=join_index) + eb = eb.reindex(index=join_index) + + if axis is None or axis == 1: + join_columns = 
a.columns.join(b.columns, how=how) + ea = ea.reindex(columns=join_columns) + eb = eb.reindex(columns=join_columns) + + ea = ea.fillna(axis=fill_axis, method=method, limit=limit) + eb = eb.fillna(axis=fill_axis, method=method, limit=limit) + + assert_frame_equal(aa, ea) + assert_frame_equal(ab, eb) + + for kind in JOIN_TYPES: + for meth in ['pad', 'bfill']: + for ax in [0, 1, None]: + for fax in [0, 1]: + left = self.frame.ix[0:4, :10] + right = self.frame.ix[2:, 6:] + empty = self.frame.ix[:0, :0] + + _check_align(left, right, axis=ax, fill_axis=fax, + how=kind, method=meth) + _check_align(left, right, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + # empty left + _check_align(empty, right, axis=ax, fill_axis=fax, + how=kind, method=meth) + _check_align(empty, right, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + + # empty right + _check_align(left, empty, axis=ax, fill_axis=fax, + how=kind, method=meth) + _check_align(left, empty, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + # both empty + _check_align(empty, empty, axis=ax, fill_axis=fax, + how=kind, method=meth) + _check_align(empty, empty, axis=ax, fill_axis=fax, + how=kind, method=meth, limit=1) + + + def test_align_int_fill_bug(self): + # GH #910 + X = np.random.rand(10,10) + Y = np.ones((10,1),dtype=int) + df1 = DataFrame(X) + df1['0.X'] = Y.squeeze() + + df2 = df1.astype(float) + + result = df1 - df1.mean() + expected = df2 - df2.mean() + assert_frame_equal(result, expected) + + #---------------------------------------------------------------------- + # Transposing + + def test_transpose(self): + frame = self.frame + dft = frame.T + for idx, series in dft.iteritems(): + for col, value in series.iteritems(): + if np.isnan(value): + self.assert_(np.isnan(frame[col][idx])) + else: + self.assertEqual(value, frame[col][idx]) + + # mixed type + index, data = tm.getMixedTypeDict() + mixed = DataFrame(data, index=index) + + mixed_T = mixed.T + for col, s in mixed_T.iteritems(): + self.assert_(s.dtype == np.object_) + + def test_transpose_get_view(self): + dft = self.frame.T + dft.values[:, 5:10] = 5 + + self.assert_((self.frame.values[5:10] == 5).all()) + + #---------------------------------------------------------------------- + # Renaming + + def test_rename(self): + mapping = { + 'A' : 'a', + 'B' : 'b', + 'C' : 'c', + 'D' : 'd' + } + + renamed = self.frame.rename(columns=mapping) + renamed2 = self.frame.rename(columns=str.lower) + + assert_frame_equal(renamed, renamed2) + assert_frame_equal(renamed2.rename(columns=str.upper), + self.frame) + + # index + + data = { + 'A' : {'foo' : 0, 'bar' : 1} + } + + # gets sorted alphabetical + df = DataFrame(data) + renamed = df.rename(index={'foo' : 'bar', 'bar' : 'foo'}) + self.assert_(np.array_equal(renamed.index, ['foo', 'bar'])) + + renamed = df.rename(index=str.upper) + self.assert_(np.array_equal(renamed.index, ['BAR', 'FOO'])) + + # have to pass something + self.assertRaises(Exception, self.frame.rename) + + # partial columns + renamed = self.frame.rename(columns={'C' : 'foo', 'D' : 'bar'}) + self.assert_(np.array_equal(renamed.columns, ['A', 'B', 'foo', 'bar'])) + + # other axis + renamed = self.frame.T.rename(index={'C' : 'foo', 'D' : 'bar'}) + self.assert_(np.array_equal(renamed.index, ['A', 'B', 'foo', 'bar'])) + + def test_rename_nocopy(self): + renamed = self.frame.rename(columns={'C' : 'foo'}, copy=False) + renamed['foo'] = 1. 
+ self.assert_((self.frame['C'] == 1.).all()) + + def test_rename_inplace(self): + self.frame.rename(columns={'C' : 'foo'}) + self.assert_('C' in self.frame) + self.assert_('foo' not in self.frame) + + c_id = id(self.frame['C']) + frame = self.frame.copy() + frame.rename(columns={'C' : 'foo'}, inplace=True) + self.assert_('C' not in frame) + self.assert_('foo' in frame) + self.assert_(id(frame['foo']) != c_id) + + + #---------------------------------------------------------------------- + # Time series related + + def test_diff(self): + the_diff = self.tsframe.diff(1) + + assert_series_equal(the_diff['A'], + self.tsframe['A'] - self.tsframe['A'].shift(1)) + + def test_pct_change(self): + rs = self.tsframe.pct_change(fill_method=None) + assert_frame_equal(rs, self.tsframe / self.tsframe.shift(1) - 1) + + rs = self.tsframe.pct_change(2) + filled = self.tsframe.fillna(method='pad') + assert_frame_equal(rs, filled / filled.shift(2) - 1) + + rs = self.tsframe.pct_change(fill_method='bfill', limit=1) + filled = self.tsframe.fillna(method='bfill', limit=1) + assert_frame_equal(rs, filled / filled.shift(1) - 1) + + rs = self.tsframe.pct_change(freq='5D') + filled = self.tsframe.fillna(method='pad') + assert_frame_equal(rs, filled / filled.shift(freq='5D') - 1) + + def test_pct_change_shift_over_nas(self): + s = Series([1., 1.5, np.nan, 2.5, 3.]) + + df = DataFrame({'a': s, 'b': s}) + + chg = df.pct_change() + expected = Series([np.nan, 0.5, np.nan, 2.5/1.5 -1, .2]) + edf = DataFrame({'a': expected, 'b':expected}) + assert_frame_equal(chg, edf) + + def test_shift(self): + # naive shift + shiftedFrame = self.tsframe.shift(5) + self.assert_(shiftedFrame.index.equals(self.tsframe.index)) + + shiftedSeries = self.tsframe['A'].shift(5) + assert_series_equal(shiftedFrame['A'], shiftedSeries) + + shiftedFrame = self.tsframe.shift(-5) + self.assert_(shiftedFrame.index.equals(self.tsframe.index)) + + shiftedSeries = self.tsframe['A'].shift(-5) + assert_series_equal(shiftedFrame['A'], shiftedSeries) + + # shift by 0 + unshifted = self.tsframe.shift(0) + assert_frame_equal(unshifted, self.tsframe) + + # shift by DateOffset + shiftedFrame = self.tsframe.shift(5, freq=datetools.BDay()) + self.assert_(len(shiftedFrame) == len(self.tsframe)) + + shiftedFrame2 = self.tsframe.shift(5, freq='B') + assert_frame_equal(shiftedFrame, shiftedFrame2) + + d = self.tsframe.index[0] + shifted_d = d + datetools.BDay(5) + assert_series_equal(self.tsframe.xs(d), + shiftedFrame.xs(shifted_d)) + + # shift int frame + int_shifted = self.intframe.shift(1) + + # Shifting with PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + self.assert_(shifted.index.equals(ps.index)) + + tm.assert_dict_equal(unshifted.ix[:, 0].valid(), ps.ix[:, 0], + compare_keys=False) + + shifted2 = ps.shift(1, 'B') + shifted3 = ps.shift(1, datetools.bday) + assert_frame_equal(shifted2, shifted3) + assert_frame_equal(ps, shifted2.shift(-1, 'B')) + + self.assertRaises(ValueError, ps.shift, freq='D') + + def test_tshift(self): + # PeriodIndex + ps = tm.makePeriodFrame() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + assert_frame_equal(unshifted, ps) + + shifted2 = ps.tshift(freq='B') + assert_frame_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=datetools.bday) + assert_frame_equal(shifted, shifted3) + + self.assertRaises(ValueError, ps.tshift, freq='M') + + # DatetimeIndex + shifted = self.tsframe.tshift(1) + unshifted = shifted.tshift(-1) + + assert_frame_equal(self.tsframe, unshifted) + + 
shifted2 = self.tsframe.tshift(freq=self.tsframe.index.freq) + assert_frame_equal(shifted, shifted2) + + inferred_ts = DataFrame(self.tsframe.values, + Index(np.asarray(self.tsframe.index)), + columns=self.tsframe.columns) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_frame_equal(shifted, self.tsframe.tshift(1)) + assert_frame_equal(unshifted, inferred_ts) + + no_freq = self.tsframe.ix[[0, 5, 7], :] + self.assertRaises(ValueError, no_freq.tshift) + + def test_apply(self): + # ufunc + applied = self.frame.apply(np.sqrt) + assert_series_equal(np.sqrt(self.frame['A']), applied['A']) + + # aggregator + applied = self.frame.apply(np.mean) + self.assertEqual(applied['A'], np.mean(self.frame['A'])) + + d = self.frame.index[0] + applied = self.frame.apply(np.mean, axis=1) + self.assertEqual(applied[d], np.mean(self.frame.xs(d))) + self.assert_(applied.index is self.frame.index) # want this + + # empty + applied = self.empty.apply(np.sqrt) + self.assert_(applied.empty) + + applied = self.empty.apply(np.mean) + self.assert_(applied.empty) + + no_rows = self.frame[:0] + result = no_rows.apply(lambda x: x.mean()) + expected = Series(np.nan, index=self.frame.columns) + assert_series_equal(result, expected) + + no_cols = self.frame.ix[:, []] + result = no_cols.apply(lambda x: x.mean(), axis=1) + expected = Series(np.nan, index=self.frame.index) + assert_series_equal(result, expected) + + def test_apply_broadcast(self): + broadcasted = self.frame.apply(np.mean, broadcast=True) + agged = self.frame.apply(np.mean) + + for col, ts in broadcasted.iteritems(): + self.assert_((ts == agged[col]).all()) + + broadcasted = self.frame.apply(np.mean, axis=1, broadcast=True) + agged = self.frame.apply(np.mean, axis=1) + for idx in broadcasted.index: + self.assert_((broadcasted.xs(idx) == agged[idx]).all()) + + def test_apply_raw(self): + result0 = self.frame.apply(np.mean, raw=True) + result1 = self.frame.apply(np.mean, axis=1, raw=True) + + expected0 = self.frame.apply(lambda x: x.values.mean()) + expected1 = self.frame.apply(lambda x: x.values.mean(), axis=1) + + assert_series_equal(result0, expected0) + assert_series_equal(result1, expected1) + + # no reduction + result = self.frame.apply(lambda x: x * 2, raw=True) + expected = self.frame * 2 + assert_frame_equal(result, expected) + + def test_apply_axis1(self): + d = self.frame.index[0] + tapplied = self.frame.apply(np.mean, axis=1) + self.assertEqual(tapplied[d], np.mean(self.frame.xs(d))) + + def test_apply_ignore_failures(self): + result = self.mixed_frame._apply_standard(np.mean, 0, + ignore_failures=True) + expected = self.mixed_frame._get_numeric_data().apply(np.mean) + assert_series_equal(result, expected) + + # test with hierarchical index + + def test_apply_mixed_dtype_corner(self): + df = DataFrame({'A' : ['foo'], + 'B' : [1.]}) + result = df[:0].apply(np.mean, axis=1) + # the result here is actually kind of ambiguous, should it be a Series + # or a DataFrame? 
+ expected = Series(np.nan, index=[]) + assert_series_equal(result, expected) + + def test_apply_empty_infer_type(self): + no_cols = DataFrame(index=['a', 'b', 'c']) + no_index = DataFrame(columns=['a', 'b', 'c']) + + def _check(df, f): + test_res = f(np.array([], dtype='f8')) + is_reduction = not isinstance(test_res, np.ndarray) + + def _checkit(axis=0, raw=False): + res = df.apply(f, axis=axis, raw=raw) + if is_reduction: + agg_axis = df._get_agg_axis(axis) + self.assert_(isinstance(res, Series)) + self.assert_(res.index is agg_axis) + else: + self.assert_(isinstance(res, DataFrame)) + + _checkit() + _checkit(axis=1) + _checkit(raw=True) + _checkit(axis=0, raw=True) + + _check(no_cols, lambda x: x) + _check(no_cols, lambda x: x.mean()) + _check(no_index, lambda x: x) + _check(no_index, lambda x: x.mean()) + + result = no_cols.apply(lambda x: x.mean(), broadcast=True) + self.assert_(isinstance(result, DataFrame)) + + def test_apply_with_args_kwds(self): + def add_some(x, howmuch=0): + return x + howmuch + + def agg_and_add(x, howmuch=0): + return x.mean() + howmuch + + def subtract_and_divide(x, sub, divide=1): + return (x - sub) / divide + + result = self.frame.apply(add_some, howmuch=2) + exp = self.frame.apply(lambda x: x + 2) + assert_frame_equal(result, exp) + + result = self.frame.apply(agg_and_add, howmuch=2) + exp = self.frame.apply(lambda x: x.mean() + 2) + assert_series_equal(result, exp) + + res = self.frame.apply(subtract_and_divide, args=(2,), divide=2) + exp = self.frame.apply(lambda x: (x - 2.) / 2.) + assert_frame_equal(res, exp) + + def test_apply_yield_list(self): + result = self.frame.apply(list) + assert_frame_equal(result, self.frame) + + def test_apply_reduce_Series(self): + self.frame.ix[::2, 'A'] = np.nan + result = self.frame.apply(np.mean, axis=1) + expected = self.frame.mean(1) + assert_series_equal(result, expected) + + def test_apply_differently_indexed(self): + df = DataFrame(np.random.randn(20, 10)) + + result0 = df.apply(Series.describe, axis=0) + expected0 = DataFrame(dict((i, v.describe()) + for i, v in df.iteritems()), + columns=df.columns) + assert_frame_equal(result0, expected0) + + result1 = df.apply(Series.describe, axis=1) + expected1 = DataFrame(dict((i, v.describe()) + for i, v in df.T.iteritems()), + columns=df.index).T + assert_frame_equal(result1, expected1) + + def test_apply_modify_traceback(self): + data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + data['C'][4] = np.nan + + def transform(row): + if row['C'].startswith('shin') and row['A'] == 'foo': + row['D'] = 7 + return row + + def transform2(row): + if (notnull(row['C']) and row['C'].startswith('shin') + and row['A'] == 'foo'): + row['D'] = 7 + return row + + try: + transformed = data.apply(transform, axis=1) + except Exception, e: + self.assertEqual(len(e.args), 2) + self.assertEqual(e.args[1], 'occurred at index 4') + + def test_swapaxes(self): + df = DataFrame(np.random.randn(10, 5)) + assert_frame_equal(df.T, df.swapaxes(0, 1)) + assert_frame_equal(df.T, df.swapaxes(1, 0)) + assert_frame_equal(df, df.swapaxes(0, 0)) + self.assertRaises(ValueError, df.swapaxes, 2, 5) + + def test_apply_convert_objects(self): + data = DataFrame({'A' : ['foo', 
'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + result = data.apply(lambda x: x, axis=1) + assert_frame_equal(result, data) + + def test_apply_attach_name(self): + result = self.frame.apply(lambda x: x.name) + expected = Series(self.frame.columns, index=self.frame.columns) + assert_series_equal(result, expected) + + result = self.frame.apply(lambda x: x.name, axis=1) + expected = Series(self.frame.index, index=self.frame.index) + assert_series_equal(result, expected) + + # non-reductions + result = self.frame.apply(lambda x: np.repeat(x.name, len(x))) + expected = DataFrame(np.tile(self.frame.columns, + (len(self.frame.index), 1)), + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(result, expected) + + result = self.frame.apply(lambda x: np.repeat(x.name, len(x)), + axis=1) + expected = DataFrame(np.tile(self.frame.index, + (len(self.frame.columns), 1)).T, + index=self.frame.index, + columns=self.frame.columns) + assert_frame_equal(result, expected) + + def test_applymap(self): + applied = self.frame.applymap(lambda x: x * 2) + assert_frame_equal(applied, self.frame * 2) + result = self.frame.applymap(type) + + # GH #465, function returning tuples + result = self.frame.applymap(lambda x: (x, x)) + self.assert_(isinstance(result['A'][0], tuple)) + + def test_filter(self): + # items + + filtered = self.frame.filter(['A', 'B', 'E']) + self.assertEqual(len(filtered.columns), 2) + self.assert_('E' not in filtered) + + # like + fcopy = self.frame.copy() + fcopy['AA'] = 1 + + filtered = fcopy.filter(like='A') + self.assertEqual(len(filtered.columns), 2) + self.assert_('AA' in filtered) + + # regex + filtered = fcopy.filter(regex='[A]+') + self.assertEqual(len(filtered.columns), 2) + self.assert_('AA' in filtered) + + # pass in None + self.assertRaises(Exception, self.frame.filter, items=None) + + # objects + filtered = self.mixed_frame.filter(like='foo') + self.assert_('foo' in filtered) + + def test_filter_corner(self): + empty = DataFrame() + + result = empty.filter([]) + assert_frame_equal(result, empty) + + result = empty.filter(like='foo') + assert_frame_equal(result, empty) + + def test_select(self): + f = lambda x: x.weekday() == 2 + result = self.tsframe.select(f, axis=0) + expected = self.tsframe.reindex( + index=self.tsframe.index[[f(x) for x in self.tsframe.index]]) + assert_frame_equal(result, expected) + + result = self.frame.select(lambda x: x in ('B', 'D'), axis=1) + expected = self.frame.reindex(columns=['B', 'D']) + assert_frame_equal(result, expected) + + def test_sort_index(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + # axis=0 + unordered = frame.ix[[3, 2, 4, 1]] + sorted_df = unordered.sort_index() + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = unordered.sort_index(ascending=False) + expected = frame[::-1] + assert_frame_equal(sorted_df, expected) + + # axis=1 + unordered = frame.ix[:, ['D', 'B', 'C', 'A']] + sorted_df = unordered.sort_index(axis=1) + expected = frame + assert_frame_equal(sorted_df, expected) + + sorted_df = unordered.sort_index(axis=1, ascending=False) + expected = frame.ix[:, ::-1] + assert_frame_equal(sorted_df, expected) + + # 
by column + sorted_df = frame.sort_index(by='A') + indexer = frame['A'].argsort().values + expected = frame.ix[frame.index[indexer]] + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort_index(by='A', ascending=False) + indexer = indexer[::-1] + expected = frame.ix[frame.index[indexer]] + assert_frame_equal(sorted_df, expected) + + # check for now + sorted_df = frame.sort(columns='A') + expected = frame.sort_index(by='A') + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort(columns='A', ascending=False) + expected = frame.sort_index(by='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.sort(columns=['A', 'B'], ascending=False) + expected = frame.sort_index(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + + self.assertRaises(ValueError, frame.sort_index, axis=2, inplace=True) + + def test_sort_index_multicolumn(self): + import random + A = np.arange(5).repeat(20) + B = np.tile(np.arange(5), 20) + random.shuffle(A) + random.shuffle(B) + frame = DataFrame({'A' : A, 'B' : B, + 'C' : np.random.randn(100)}) + + result = frame.sort_index(by=['A', 'B']) + indexer = np.lexsort((frame['B'], frame['A'])) + expected = frame.take(indexer) + assert_frame_equal(result, expected) + + result = frame.sort_index(by=['A', 'B'], ascending=False) + expected = frame.take(indexer[::-1]) + assert_frame_equal(result, expected) + + result = frame.sort_index(by=['B', 'A']) + indexer = np.lexsort((frame['A'], frame['B'])) + expected = frame.take(indexer) + assert_frame_equal(result, expected) + + def test_sort_index_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + # axis=0 + unordered = frame.ix[[3, 2, 4, 1]] + a_id = id(unordered['A']) + df = unordered.copy() + df.sort_index(inplace=True) + expected = frame + assert_frame_equal(df, expected) + self.assert_(a_id != id(df['A'])) + + df = unordered.copy() + df.sort_index(ascending=False, inplace=True) + expected = frame[::-1] + assert_frame_equal(df, expected) + + # axis=1 + unordered = frame.ix[:, ['D', 'B', 'C', 'A']] + df = unordered.copy() + df.sort_index(axis=1, inplace=True) + expected = frame + assert_frame_equal(df, expected) + + df = unordered.copy() + df.sort_index(axis=1, ascending=False, inplace=True) + expected = frame.ix[:, ::-1] + assert_frame_equal(df, expected) + + def test_sort_inplace(self): + frame = DataFrame(np.random.randn(4, 4), index=[1, 2, 3, 4], + columns=['A', 'B', 'C', 'D']) + + sorted_df = frame.copy() + sorted_df.sort(columns='A', inplace=True) + expected = frame.sort_index(by='A') + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort(columns='A', ascending=False, inplace=True) + expected = frame.sort_index(by='A', ascending=False) + assert_frame_equal(sorted_df, expected) + + sorted_df = frame.copy() + sorted_df.sort(columns=['A', 'B'], ascending=False, inplace=True) + expected = frame.sort_index(by=['A', 'B'], ascending=False) + assert_frame_equal(sorted_df, expected) + + def test_frame_column_inplace_sort_exception(self): + s = self.frame['A'] + self.assertRaises(Exception, s.sort) + + cp = s.copy() + cp.sort() # it works! 
+ + def test_combine_first(self): + # disjoint + head, tail = self.frame[:5], self.frame[5:] + + combined = head.combine_first(tail) + reordered_frame = self.frame.reindex(combined.index) + assert_frame_equal(combined, reordered_frame) + self.assert_(tm.equalContents(combined.columns, self.frame.columns)) + assert_series_equal(combined['A'], reordered_frame['A']) + + # same index + fcopy = self.frame.copy() + fcopy['A'] = 1 + del fcopy['C'] + + fcopy2 = self.frame.copy() + fcopy2['B'] = 0 + del fcopy2['D'] + + combined = fcopy.combine_first(fcopy2) + + self.assert_((combined['A'] == 1).all()) + assert_series_equal(combined['B'], fcopy['B']) + assert_series_equal(combined['C'], fcopy2['C']) + assert_series_equal(combined['D'], fcopy['D']) + + # overlap + head, tail = reordered_frame[:10].copy(), reordered_frame + head['A'] = 1 + + combined = head.combine_first(tail) + self.assert_((combined['A'][:10] == 1).all()) + + # reverse overlap + tail['A'][:10] = 0 + combined = tail.combine_first(head) + self.assert_((combined['A'][:10] == 0).all()) + + # no overlap + f = self.frame[:10] + g = self.frame[10:] + combined = f.combine_first(g) + assert_series_equal(combined['A'].reindex(f.index), f['A']) + assert_series_equal(combined['A'].reindex(g.index), g['A']) + + # corner cases + comb = self.frame.combine_first(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combine_first(self.frame) + assert_frame_equal(comb, self.frame) + + def test_combine_first_mixed_bug(self): + idx = Index(['a','b','c','e']) + ser1 = Series([5.0,-9.0,4.0,100.],index=idx) + ser2 = Series(['a', 'b', 'c', 'e'], index=idx) + ser3 = Series([12,4,5,97], index=idx) + + frame1 = DataFrame({"col0" : ser1, + "col2" : ser2, + "col3" : ser3}) + + idx = Index(['a','b','c','f']) + ser1 = Series([5.0,-9.0,4.0,100.], index=idx) + ser2 = Series(['a','b','c','f'], index=idx) + ser3 = Series([12,4,5,97],index=idx) + + frame2 = DataFrame({"col1" : ser1, + "col2" : ser2, + "col5" : ser3}) + + + combined = frame1.combine_first(frame2) + self.assertEqual(len(combined.columns), 5) + + def test_update(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other) + + expected = DataFrame([[1.5, nan, 3], + [3.6, 2, 3], + [1.5, nan, 3], + [1.5, nan, 7.]]) + assert_frame_equal(df, expected) + + def test_update_nooverwrite(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, overwrite=False) + + expected = DataFrame([[1.5, nan, 3], + [1.5, 2, 3], + [1.5, nan, 3], + [1.5, nan, 3.]]) + assert_frame_equal(df, expected) + + def test_update_filtered(self): + df = DataFrame([[1.5, nan, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[3.6, 2., np.nan], + [np.nan, np.nan, 7]], index=[1, 3]) + + df.update(other, filter_func=lambda x: x > 2) + + expected = DataFrame([[1.5, nan, 3], + [1.5, nan, 3], + [1.5, nan, 3], + [1.5, nan, 7.]]) + assert_frame_equal(df, expected) + + def test_update_raise(self): + df = DataFrame([[1.5, 1, 3.], + [1.5, nan, 3.], + [1.5, nan, 3], + [1.5, nan, 3]]) + + other = DataFrame([[2., nan], + [nan, 7]], index=[1, 3], columns=[1,2]) + + np.testing.assert_raises(Exception, df.update, *(other,), + **{'raise_conflict' : True}) + + def test_combineAdd(self): + # trivial + comb = self.frame.combineAdd(self.frame) + 
assert_frame_equal(comb, self.frame * 2) + + # more rigorous + a = DataFrame([[1., nan, nan, 2., nan]], + columns=np.arange(5)) + b = DataFrame([[2., 3., nan, 2., 6., nan]], + columns=np.arange(6)) + expected = DataFrame([[3., 3., nan, 4., 6., nan]], + columns=np.arange(6)) + + result = a.combineAdd(b) + assert_frame_equal(result, expected) + result2 = a.T.combineAdd(b.T) + assert_frame_equal(result2, expected.T) + + expected2 = a.combine(b, operator.add, fill_value=0.) + assert_frame_equal(expected, expected2) + + # corner cases + comb = self.frame.combineAdd(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combineAdd(self.frame) + assert_frame_equal(comb, self.frame) + + # integer corner case + df1 = DataFrame({'x':[5]}) + df2 = DataFrame({'x':[1]}) + df3 = DataFrame({'x':[6]}) + comb = df1.combineAdd(df2) + assert_frame_equal(comb, df3) + + # TODO: test integer fill corner? + + def test_combineMult(self): + # trivial + comb = self.frame.combineMult(self.frame) + + assert_frame_equal(comb, self.frame ** 2) + + # corner cases + comb = self.frame.combineMult(self.empty) + assert_frame_equal(comb, self.frame) + + comb = self.empty.combineMult(self.frame) + assert_frame_equal(comb, self.frame) + + def test_combine_generic(self): + df1 = self.frame + df2 = self.frame.ix[:-5, ['A', 'B', 'C']] + + combined = df1.combine(df2, np.add) + combined2 = df2.combine(df1, np.add) + self.assert_(combined['D'].isnull().all()) + self.assert_(combined2['D'].isnull().all()) + + chunk = combined.ix[:-5, ['A', 'B', 'C']] + chunk2 = combined2.ix[:-5, ['A', 'B', 'C']] + + exp = self.frame.ix[:-5, ['A', 'B', 'C']].reindex_like(chunk) * 2 + assert_frame_equal(chunk, exp) + assert_frame_equal(chunk2, exp) + + def test_clip(self): + median = self.frame.median().median() + + capped = self.frame.clip_upper(median) + self.assert_(not (capped.values > median).any()) + + floored = self.frame.clip_lower(median) + self.assert_(not (floored.values < median).any()) + + double = self.frame.clip(upper=median, lower=median) + self.assert_(not (double.values != median).any()) + + def test_get_X_columns(self): + # numeric and object columns + + # Booleans get casted to float in DataFrame, so skip for now + df = DataFrame({'a' : [1, 2, 3], + # 'b' : [True, False, True], + 'c' : ['foo', 'bar', 'baz'], + 'd' : [None, None, None], + 'e' : [3.14, 0.577, 2.773]}) + + self.assert_(np.array_equal(df._get_numeric_data().columns, + ['a', 'e'])) + + def test_get_numeric_data(self): + df = DataFrame({'a' : 1., 'b' : 2, 'c' : 'foo'}, + index=np.arange(10)) + + result = df._get_numeric_data() + expected = df.ix[:, ['a', 'b']] + assert_frame_equal(result, expected) + + only_obj = df.ix[:, ['c']] + result = only_obj._get_numeric_data() + expected = df.ix[:, []] + assert_frame_equal(result, expected) + + def test_count(self): + f = lambda s: notnull(s).sum() + self._check_stat_op('count', f, + has_skipna=False, + has_numeric_only=True) + + # corner case + frame = DataFrame() + ct1 = frame.count(1) + self.assert_(isinstance(ct1, Series)) + + ct2 = frame.count(0) + self.assert_(isinstance(ct2, Series)) + + # GH #423 + df = DataFrame(index=range(10)) + result = df.count(1) + expected = Series(0, index=df.index) + assert_series_equal(result, expected) + + df = DataFrame(columns=range(10)) + result = df.count(0) + expected = Series(0, index=df.columns) + assert_series_equal(result, expected) + + df = DataFrame() + result = df.count() + expected = Series(0, index=[]) + assert_series_equal(result, expected) + + def 
test_sum(self): + self._check_stat_op('sum', np.sum, has_numeric_only=True) + + def test_stat_operators_attempt_obj_array(self): + data = { + 'a': [-0.00049987540199591344, -0.0016467257772919831, + 0.00067695870775883013], + 'b': [-0, -0, 0.0], + 'c': [0.00031111847529610595, 0.0014902627951905339, + -0.00094099200035979691] + } + df1 = DataFrame(data, index=['foo', 'bar', 'baz'], + dtype='O') + methods = ['sum', 'mean', 'prod', 'var', 'std', 'skew', 'min', 'max'] + + # GH #676 + df2 = DataFrame({0: [np.nan, 2], 1: [np.nan, 3], + 2: [np.nan, 4]}, dtype=object) + + for df in [df1, df2]: + for meth in methods: + self.assert_(df.values.dtype == np.object_) + result = getattr(df, meth)(1) + expected = getattr(df.astype('f8'), meth)(1) + assert_series_equal(result, expected) + + def test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_product(self): + self._check_stat_op('product', np.prod) + + def test_median(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper) + + def test_min(self): + self._check_stat_op('min', np.min) + self._check_stat_op('min', np.min, frame=self.intframe) + + def test_cummin(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cummin = self.tsframe.cummin() + expected = self.tsframe.apply(Series.cummin) + assert_frame_equal(cummin, expected) + + # axis = 1 + cummin = self.tsframe.cummin(axis=1) + expected = self.tsframe.apply(Series.cummin, axis=1) + assert_frame_equal(cummin, expected) + + # works + df = DataFrame({'A' : np.arange(20)}, index=np.arange(20)) + result = df.cummin() + + # fix issue + cummin_xs = self.tsframe.cummin(axis=1) + self.assertEqual(np.shape(cummin_xs), np.shape(self.tsframe)) + + def test_cummax(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cummax = self.tsframe.cummax() + expected = self.tsframe.apply(Series.cummax) + assert_frame_equal(cummax, expected) + + # axis = 1 + cummax = self.tsframe.cummax(axis=1) + expected = self.tsframe.apply(Series.cummax, axis=1) + assert_frame_equal(cummax, expected) + + # works + df = DataFrame({'A' : np.arange(20)}, index=np.arange(20)) + result = df.cummax() + + # fix issue + cummax_xs = self.tsframe.cummax(axis=1) + self.assertEqual(np.shape(cummax_xs), np.shape(self.tsframe)) + + + def test_max(self): + self._check_stat_op('max', np.max) + self._check_stat_op('max', np.max, frame=self.intframe) + + def test_mad(self): + f = lambda x: np.abs(x - x.mean()).mean() + self._check_stat_op('mad', f) + + def test_var_std(self): + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt) + + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt) + + result = self.tsframe.std(ddof=4) + expected = self.tsframe.apply(lambda x: x.std(ddof=4)) + assert_almost_equal(result, expected) + + result = self.tsframe.var(ddof=4) + expected = self.tsframe.apply(lambda x: x.var(ddof=4)) + assert_almost_equal(result, expected) + + arr = np.repeat(np.random.random((1, 1000)), 1000, 0) + result = nanops.nanvar(arr, axis=0) + self.assertFalse((result < 0).any()) + if nanops._USE_BOTTLENECK: + nanops._USE_BOTTLENECK = False + result = nanops.nanvar(arr, axis=0) + self.assertFalse((result < 0).any()) + nanops._USE_BOTTLENECK = True + + def test_skew(self): + _skip_if_no_scipy() + from scipy.stats import skew + + def alt(x): + if len(x) < 3: + return np.nan + return skew(x, 
bias=False) + + self._check_stat_op('skew', alt) + + def test_kurt(self): + _skip_if_no_scipy() + + from scipy.stats import kurtosis + + def alt(x): + if len(x) < 4: + return np.nan + return kurtosis(x, bias=False) + + self._check_stat_op('kurt', alt) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + df = DataFrame(np.random.randn(6, 3), index=index) + assert_series_equal(df.kurt(), df.kurt(level=0).xs('bar')) + + def _check_stat_op(self, name, alternative, frame=None, has_skipna=True, + has_numeric_only=False): + if frame is None: + frame = self.frame + # set some NAs + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + + f = getattr(frame, name) + + if has_skipna: + def skipna_wrapper(x): + nona = x.dropna().values + if len(nona) == 0: + return np.nan + return alternative(nona) + + def wrapper(x): + return alternative(x.values) + + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + assert_series_equal(result0, frame.apply(wrapper)) + assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False) # HACK: win32 + else: + skipna_wrapper = alternative + wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + assert_series_equal(result0, frame.apply(skipna_wrapper)) + assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + check_dtype=False) + + # result = f(axis=1) + # comp = frame.apply(alternative, axis=1).reindex(result.index) + # assert_series_equal(result, comp) + + self.assertRaises(Exception, f, axis=2) + + # make sure works on mixed-type frame + getattr(self.mixed_frame, name)(axis=0) + getattr(self.mixed_frame, name)(axis=1) + + if has_numeric_only: + getattr(self.mixed_frame, name)(axis=0, numeric_only=True) + getattr(self.mixed_frame, name)(axis=1, numeric_only=True) + getattr(self.frame, name)(axis=0, numeric_only=False) + getattr(self.frame, name)(axis=1, numeric_only=False) + + # all NA case + if has_skipna: + all_na = self.frame * np.NaN + r0 = getattr(all_na, name)(axis=0) + r1 = getattr(all_na, name)(axis=1) + self.assert_(np.isnan(r0).all()) + self.assert_(np.isnan(r1).all()) + + def test_sum_corner(self): + axis0 = self.empty.sum(0) + axis1 = self.empty.sum(1) + self.assert_(isinstance(axis0, Series)) + self.assert_(isinstance(axis1, Series)) + self.assertEquals(len(axis0), 0) + self.assertEquals(len(axis1), 0) + + def test_sum_object(self): + values = self.frame.values.astype(int) + frame = DataFrame(values, index=self.frame.index, + columns=self.frame.columns) + deltas = frame * timedelta(1) + deltas.sum() + + def test_sum_bool(self): + # ensure this works, bug report + bools = np.isnan(self.frame) + bools.sum(1) + bools.sum(0) + + def test_mean_corner(self): + # unit test when have object data + the_mean = self.mixed_frame.mean(axis=0) + the_sum = self.mixed_frame.sum(axis=0, numeric_only=True) + self.assert_(the_sum.index.equals(the_mean.index)) + self.assert_(len(the_mean.index) < len(self.mixed_frame.columns)) + + # xs sum mixed type, just want to know it works... 
+ the_mean = self.mixed_frame.mean(axis=1) + the_sum = self.mixed_frame.sum(axis=1, numeric_only=True) + self.assert_(the_sum.index.equals(the_mean.index)) + + # take mean of boolean column + self.frame['bool'] = self.frame['A'] > 0 + means = self.frame.mean(0) + self.assertEqual(means['bool'], self.frame['bool'].values.mean()) + + def test_stats_mixed_type(self): + # don't blow up + self.mixed_frame.std(1) + self.mixed_frame.var(1) + self.mixed_frame.mean(1) + self.mixed_frame.skew(1) + + def test_median_corner(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper, frame=self.intframe) + + def test_quantile(self): + from pandas.compat.scipy import scoreatpercentile + + q = self.tsframe.quantile(0.1, axis=0) + self.assertEqual(q['A'], scoreatpercentile(self.tsframe['A'], 10)) + q = self.tsframe.quantile(0.9, axis=1) + q = self.intframe.quantile(0.1) + self.assertEqual(q['A'], scoreatpercentile(self.intframe['A'], 10)) + + # test degenerate case + q = DataFrame({'x':[],'y':[]}).quantile(0.1, axis=0) + assert(np.isnan(q['x']) and np.isnan(q['y'])) + + def test_cumsum(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cumsum = self.tsframe.cumsum() + expected = self.tsframe.apply(Series.cumsum) + assert_frame_equal(cumsum, expected) + + # axis = 1 + cumsum = self.tsframe.cumsum(axis=1) + expected = self.tsframe.apply(Series.cumsum, axis=1) + assert_frame_equal(cumsum, expected) + + # works + df = DataFrame({'A' : np.arange(20)}, index=np.arange(20)) + result = df.cumsum() + + # fix issue + cumsum_xs = self.tsframe.cumsum(axis=1) + self.assertEqual(np.shape(cumsum_xs), np.shape(self.tsframe)) + + def test_cumprod(self): + self.tsframe.ix[5:10, 0] = nan + self.tsframe.ix[10:15, 1] = nan + self.tsframe.ix[15:, 2] = nan + + # axis = 0 + cumprod = self.tsframe.cumprod() + expected = self.tsframe.apply(Series.cumprod) + assert_frame_equal(cumprod, expected) + + # axis = 1 + cumprod = self.tsframe.cumprod(axis=1) + expected = self.tsframe.apply(Series.cumprod, axis=1) + assert_frame_equal(cumprod, expected) + + # fix issue + cumprod_xs = self.tsframe.cumprod(axis=1) + self.assertEqual(np.shape(cumprod_xs), np.shape(self.tsframe)) + + # ints + df = self.tsframe.fillna(0).astype(int) + df.cumprod(0) + df.cumprod(1) + + def test_rank(self): + from pandas.compat.scipy import rankdata + + self.frame['A'][::2] = np.nan + self.frame['B'][::3] = np.nan + self.frame['C'][::4] = np.nan + self.frame['D'][::5] = np.nan + + ranks0 = self.frame.rank() + ranks1 = self.frame.rank(1) + mask = np.isnan(self.frame.values) + + fvals = self.frame.fillna(np.inf).values + + exp0 = np.apply_along_axis(rankdata, 0, fvals) + exp0[mask] = np.nan + + exp1 = np.apply_along_axis(rankdata, 1, fvals) + exp1[mask] = np.nan + + assert_almost_equal(ranks0.values, exp0) + assert_almost_equal(ranks1.values, exp1) + + def test_rank2(self): + from datetime import datetime + + df = DataFrame([['b','c','a'],['a','c','b']]) + expected = DataFrame([[2.0, 3.0, 1.0], [1, 3, 2]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + expected = DataFrame([[2.0, 1.5, 1.0], [1, 1.5, 2]]) + result = df.rank(0, numeric_only=False) + assert_frame_equal(result, expected) + + df = DataFrame([['b',np.nan,'a'],['a','c','b']]) + expected = DataFrame([[2.0, nan, 1.0], [1.0, 3.0, 2.0]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + expected = 
DataFrame([[2.0, nan, 1.0], [1.0, 1.0, 2.0]]) + result = df.rank(0, numeric_only=False) + assert_frame_equal(result, expected) + + # f7u12, this does not work without extensive workaround + data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)], + [datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 1)]] + df = DataFrame(data) + expected = DataFrame([[2., nan, 1.], + [2., 3., 1.]]) + result = df.rank(1, numeric_only=False) + assert_frame_equal(result, expected) + + # mixed-type frames + self.mixed_frame['foo'] = datetime.now() + result = self.mixed_frame.rank(1) + expected = self.mixed_frame.rank(1, numeric_only=True) + assert_frame_equal(result, expected) + + def test_describe(self): + desc = self.tsframe.describe() + desc = self.mixed_frame.describe() + desc = self.frame.describe() + + def test_describe_percentiles(self): + desc = self.frame.describe(percentile_width=50) + assert '75%' in desc.index + assert '25%' in desc.index + + desc = self.frame.describe(percentile_width=95) + assert '97.5%' in desc.index + assert '2.5%' in desc.index + + def test_describe_no_numeric(self): + df = DataFrame({'A' : ['foo', 'foo', 'bar'] * 8, + 'B' : ['a', 'b', 'c', 'd'] * 6}) + desc = df.describe() + expected = DataFrame(dict((k, v.describe()) + for k, v in df.iteritems()), + columns=df.columns) + assert_frame_equal(desc, expected) + + df = DataFrame({'time' : self.tsframe.index}) + desc = df.describe() + assert(desc.time['first'] == min(self.tsframe.index)) + + def test_get_axis_etc(self): + f = self.frame + + self.assertEquals(f._get_axis_number(0), 0) + self.assertEquals(f._get_axis_number(1), 1) + self.assertEquals(f._get_axis_name(0), 'index') + self.assertEquals(f._get_axis_name(1), 'columns') + + self.assert_(f._get_axis(0) is f.index) + self.assert_(f._get_axis(1) is f.columns) + self.assertRaises(Exception, f._get_axis_number, 2) + + def test_combine_first_mixed(self): + a = Series(['a','b'], index=range(2)) + b = Series(range(2), index=range(2)) + f = DataFrame({'A' : a, 'B' : b}) + + a = Series(['a','b'], index=range(5, 7)) + b = Series(range(2), index=range(5, 7)) + g = DataFrame({'A' : a, 'B' : b}) + + combined = f.combine_first(g) + + def test_more_asMatrix(self): + values = self.mixed_frame.as_matrix() + self.assertEqual(values.shape[1], len(self.mixed_frame.columns)) + + def test_reindex_boolean(self): + frame = DataFrame(np.ones((10, 2), dtype=bool), + index=np.arange(0, 20, 2), + columns=[0, 2]) + + reindexed = frame.reindex(np.arange(10)) + self.assert_(reindexed.values.dtype == np.object_) + self.assert_(isnull(reindexed[0][1])) + + reindexed = frame.reindex(columns=range(3)) + self.assert_(reindexed.values.dtype == np.object_) + self.assert_(isnull(reindexed[1]).all()) + + def test_reindex_objects(self): + reindexed = self.mixed_frame.reindex(columns=['foo', 'A', 'B']) + self.assert_('foo' in reindexed) + + reindexed = self.mixed_frame.reindex(columns=['A', 'B']) + self.assert_('foo' not in reindexed) + + def test_reindex_corner(self): + index = Index(['a', 'b', 'c']) + dm = self.empty.reindex(index=[1, 2, 3]) + reindexed = dm.reindex(columns=index) + self.assert_(reindexed.columns.equals(index)) + + # ints are weird + + smaller = self.intframe.reindex(columns=['A', 'B', 'E']) + self.assert_(smaller['E'].dtype == np.float64) + + def test_reindex_axis(self): + cols = ['A', 'B', 'E'] + reindexed1 = self.intframe.reindex_axis(cols, axis=1) + reindexed2 = self.intframe.reindex(columns=cols) + assert_frame_equal(reindexed1, reindexed2) + + rows = self.intframe.index[0:5] 
+ reindexed1 = self.intframe.reindex_axis(rows, axis=0) + reindexed2 = self.intframe.reindex(index=rows) + assert_frame_equal(reindexed1, reindexed2) + + self.assertRaises(ValueError, self.intframe.reindex_axis, rows, axis=2) + + # no-op case + cols = self.frame.columns.copy() + newFrame = self.frame.reindex_axis(cols, axis=1) + assert_frame_equal(newFrame, self.frame) + + def test_reindex_with_nans(self): + df = DataFrame([[1,2], [3,4], [np.nan,np.nan], [7,8], [9,10]], + columns=['a', 'b'], + index=[100.0, 101.0, np.nan, 102.0, 103.0]) + + result = df.reindex(index=[101.0, 102.0, 103.0]) + expected = df.ix[[1, 3, 4]] + assert_frame_equal(result, expected) + + result = df.reindex(index=[103.0]) + expected = df.ix[[4]] + assert_frame_equal(result, expected) + + result = df.reindex(index=[101.0]) + expected = df.ix[[1]] + assert_frame_equal(result, expected) + + def test_reindex_multi(self): + df = DataFrame(np.random.randn(3, 3)) + + result = df.reindex(range(4), range(4)) + expected = df.reindex(range(4)).reindex(columns=range(4)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(range(4), range(4)) + expected = df.reindex(range(4)).reindex(columns=range(4)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randint(0, 10, (3, 3))) + + result = df.reindex(range(2), range(2)) + expected = df.reindex(range(2)).reindex(columns=range(2)) + + assert_frame_equal(result, expected) + + df = DataFrame(np.random.randn(5, 3) + 1j, columns=['a','b','c']) + + result = df.reindex(index=[0,1], columns=['a', 'b']) + expected = df.reindex([0, 1]).reindex(columns=['a', 'b']) + + assert_frame_equal(result, expected) + + def test_rename_objects(self): + renamed = self.mixed_frame.rename(columns=str.upper) + self.assert_('FOO' in renamed) + self.assert_('foo' not in renamed) + + def test_fill_corner(self): + self.mixed_frame['foo'][5:20] = nan + self.mixed_frame['A'][-10:] = nan + + filled = self.mixed_frame.fillna(value=0) + self.assert_((filled['foo'][5:20] == 0).all()) + del self.mixed_frame['foo'] + + empty_float = self.frame.reindex(columns=[]) + result = empty_float.fillna(value=0) + + def test_count_objects(self): + dm = DataFrame(self.mixed_frame._series) + df = DataFrame(self.mixed_frame._series) + + tm.assert_series_equal(dm.count(), df.count()) + tm.assert_series_equal(dm.count(1), df.count(1)) + + def test_cumsum_corner(self): + dm = DataFrame(np.arange(20).reshape(4, 5), + index=range(4), columns=range(5)) + result = dm.cumsum() + + #---------------------------------------------------------------------- + # Stacking / unstacking + + def test_stack_unstack(self): + stacked = self.frame.stack() + stacked_df = DataFrame({'foo' : stacked, 'bar' : stacked}) + + unstacked = stacked.unstack() + unstacked_df = stacked_df.unstack() + + assert_frame_equal(unstacked, self.frame) + assert_frame_equal(unstacked_df['bar'], self.frame) + + unstacked_cols = stacked.unstack(0) + unstacked_cols_df = stacked_df.unstack(0) + assert_frame_equal(unstacked_cols.T, self.frame) + assert_frame_equal(unstacked_cols_df['bar'].T, self.frame) + + def test_unstack_to_series(self): + # check reversibility + data = self.frame.unstack() + + self.assertTrue(isinstance(data, Series)) + undo = data.unstack().T + assert_frame_equal(undo, self.frame) + + # check NA handling + data = DataFrame({'x': [1, 2, np.NaN], 'y': [3.0, 4, np.NaN]}) + data.index = Index(['a','b','c']) + result = data.unstack() + + midx = 
MultiIndex(levels=[['x','y'],['a','b','c']], + labels=[[0,0,0,1,1,1],[0,1,2,0,1,2]]) + expected = Series([1,2,np.NaN,3,4,np.NaN], index=midx) + + assert_series_equal(result, expected) + + # check composability of unstack + old_data = data.copy() + for _ in xrange(4): + data = data.unstack() + assert_frame_equal(old_data, data) + + def test_reset_index(self): + stacked = self.frame.stack()[::2] + stacked = DataFrame({'foo' : stacked, 'bar' : stacked}) + + names = ['first', 'second'] + stacked.index.names = names + deleveled = stacked.reset_index() + for i, (lev, lab) in enumerate(zip(stacked.index.levels, + stacked.index.labels)): + values = lev.take(lab) + name = names[i] + assert_almost_equal(values, deleveled[name]) + + stacked.index.names = [None, None] + deleveled2 = stacked.reset_index() + self.assert_(np.array_equal(deleveled['first'], + deleveled2['level_0'])) + self.assert_(np.array_equal(deleveled['second'], + deleveled2['level_1'])) + + # default name assigned + rdf = self.frame.reset_index() + self.assert_(np.array_equal(rdf['index'], self.frame.index.values)) + + # default name assigned, corner case + df = self.frame.copy() + df['index'] = 'foo' + rdf = df.reset_index() + self.assert_(np.array_equal(rdf['level_0'], self.frame.index.values)) + + # but this is ok + self.frame.index.name = 'index' + deleveled = self.frame.reset_index() + self.assert_(np.array_equal(deleveled['index'], + self.frame.index.values)) + self.assert_(np.array_equal(deleveled.index, + np.arange(len(deleveled)))) + + # preserve column names + self.frame.columns.name = 'columns' + resetted = self.frame.reset_index() + self.assertEqual(resetted.columns.name, 'columns') + + def test_reset_index_right_dtype(self): + time = np.arange(0.0, 10, np.sqrt(2)/2) + s1 = Series((9.81 * time ** 2) /2, + index=Index(time, name='time'), + name='speed') + df = DataFrame(s1) + + resetted = s1.reset_index() + self.assert_(resetted['time'].dtype == np.float64) + + resetted = df.reset_index() + self.assert_(resetted['time'].dtype == np.float64) + + #---------------------------------------------------------------------- + # Tests to cope with refactored internals + + def test_as_matrix_numeric_cols(self): + self.frame['foo'] = 'bar' + + values = self.frame.as_matrix(['A', 'B', 'C', 'D']) + self.assert_(values.dtype == np.float64) + + def test_constructor_frame_copy(self): + cop = DataFrame(self.frame, copy=True) + cop['A'] = 5 + self.assert_((cop['A'] == 5).all()) + self.assert_(not (self.frame['A'] == 5).all()) + + def test_constructor_ndarray_copy(self): + df = DataFrame(self.frame.values) + + self.frame.values[5] = 5 + self.assert_((df.values[5] == 5).all()) + + df = DataFrame(self.frame.values, copy=True) + self.frame.values[6] = 6 + self.assert_(not (df.values[6] == 6).all()) + + def test_constructor_series_copy(self): + series = self.frame._series + + df = DataFrame({'A' : series['A']}) + df['A'][:] = 5 + + self.assert_(not (series['A'] == 5).all()) + + def test_assign_columns(self): + self.frame['hi'] = 'there' + + frame = self.frame.copy() + frame.columns = ['foo', 'bar', 'baz', 'quux', 'foo2'] + assert_series_equal(self.frame['C'], frame['baz']) + assert_series_equal(self.frame['hi'], frame['foo2']) + + def test_cast_internals(self): + casted = DataFrame(self.frame._data, dtype=int) + expected = DataFrame(self.frame._series, dtype=int) + assert_frame_equal(casted, expected) + + def test_consolidate(self): + self.frame['E'] = 7. 
+ consolidated = self.frame.consolidate() + self.assert_(len(consolidated._data.blocks) == 1) + + # Ensure copy, do I want this? + recons = consolidated.consolidate() + self.assert_(recons is not consolidated) + assert_frame_equal(recons, consolidated) + + self.frame['F'] = 8. + self.assert_(len(self.frame._data.blocks) == 3) + self.frame.consolidate(inplace=True) + self.assert_(len(self.frame._data.blocks) == 1) + + def test_consolidate_inplace(self): + frame = self.frame.copy() + + # triggers in-place consolidation + for letter in range(ord('A'), ord('Z')): + self.frame[chr(letter)] = chr(letter) + + def test_as_matrix_consolidate(self): + self.frame['E'] = 7. + self.assert_(not self.frame._data.is_consolidated()) + _ = self.frame.as_matrix() + self.assert_(self.frame._data.is_consolidated()) + + def test_modify_values(self): + self.frame.values[5] = 5 + self.assert_((self.frame.values[5] == 5).all()) + + # unconsolidated + self.frame['E'] = 7. + self.frame.values[6] = 6 + self.assert_((self.frame.values[6] == 6).all()) + + def test_boolean_set_uncons(self): + self.frame['E'] = 7. + + expected = self.frame.values.copy() + expected[expected > 1] = 2 + + self.frame[self.frame > 1] = 2 + assert_almost_equal(expected, self.frame.values) + + def test_boolean_set_mixed_type(self): + bools = self.mixed_frame.applymap(lambda x: x != 2).astype(bool) + self.assertRaises(Exception, self.mixed_frame.__setitem__, bools, 2) + + def test_xs_view(self): + dm = DataFrame(np.arange(20.).reshape(4, 5), + index=range(4), columns=range(5)) + + dm.xs(2, copy=False)[:] = 5 + self.assert_((dm.xs(2) == 5).all()) + + dm.xs(2)[:] = 10 + self.assert_((dm.xs(2) == 5).all()) + + # TODO (?): deal with mixed-type fiasco? + self.assertRaises(Exception, self.mixed_frame.xs, + self.mixed_frame.index[2], copy=False) + + # unconsolidated + dm['foo'] = 6. + dm.xs(3, copy=False)[:] = 10 + self.assert_((dm.xs(3) == 10).all()) + + def test_boolean_indexing(self): + idx = range(3) + cols = range(3) + df1 = DataFrame(index=idx, columns=cols, \ + data=np.array([[0.0, 0.5, 1.0], + [1.5, 2.0, 2.5], + [3.0, 3.5, 4.0]], dtype=float)) + df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols)))) + + expected = DataFrame(index=idx, columns=cols, \ + data=np.array([[0.0, 0.5, 1.0], + [1.5, 2.0, -1], + [-1, -1, -1]], dtype=float)) + + df1[df1 > 2.0 * df2] = -1 + assert_frame_equal(df1, expected) + + def test_sum_bools(self): + df = DataFrame(index=range(1), columns=range(10)) + bools = np.isnan(df) + self.assert_(bools.sum(axis=1)[0] == 10) + + def test_fillna_col_reordering(self): + idx = range(20) + cols = ["COL." 
+ str(i) for i in range(5, 0, -1)] + data = np.random.rand(20, 5) + df = DataFrame(index=range(20), columns=cols, data=data) + self.assert_(df.columns.tolist() == df.fillna().columns.tolist()) + + def test_take(self): + # homogeneous + #---------------------------------------- + + # mixed-dtype + #---------------------------------------- + order = [4, 1, 2, 0, 3] + + result = self.mixed_frame.take(order, axis=0) + expected = self.mixed_frame.reindex(self.mixed_frame.index.take(order)) + assert_frame_equal(result, expected) + + # axis = 1 + result = self.mixed_frame.take(order, axis=1) + expected = self.mixed_frame.ix[:, ['foo', 'B', 'C', 'A', 'D']] + assert_frame_equal(result, expected) + + def test_iterkv_names(self): + for k, v in self.mixed_frame.iterkv(): + self.assertEqual(v.name, k) + + def test_series_put_names(self): + series = self.mixed_frame._series + for k, v in series.iteritems(): + self.assertEqual(v.name, k) + + def test_dot(self): + a = DataFrame(np.random.randn(3, 4), index=['a', 'b', 'c'], + columns=['p', 'q', 'r', 's']) + b = DataFrame(np.random.randn(4, 2), index=['p', 'q', 'r', 's'], + columns=['one', 'two']) + + result = a.dot(b) + expected = DataFrame(np.dot(a.values, b.values), + index=['a', 'b', 'c'], + columns=['one', 'two']) + assert_frame_equal(result, expected) + + def test_idxmin(self): + frame = self.frame + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in [frame, self.intframe]: + result = df.idxmin(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmin, axis=axis, skipna=skipna) + assert_series_equal(result, expected) + + self.assertRaises(Exception, frame.idxmin, axis=2) + + def test_idxmax(self): + frame = self.frame + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + for skipna in [True, False]: + for axis in [0, 1]: + for df in [frame, self.intframe]: + result = df.idxmax(axis=axis, skipna=skipna) + expected = df.apply(Series.idxmax, axis=axis, skipna=skipna) + assert_series_equal(result, expected) + + self.assertRaises(Exception, frame.idxmax, axis=2) + + def test_stale_cached_series_bug_473(self): + Y = DataFrame(np.random.random((4, 4)), index=('a', 'b','c','d'), + columns=('e','f','g','h')) + repr(Y) + Y['e'] = Y['e'].astype('object') + Y['g']['c'] = np.NaN + repr(Y) + result = Y.sum() + exp = Y['g'].sum() + self.assert_(isnull(Y['g']['c'])) + + def test_index_namedtuple(self): + try: + from collections import namedtuple + except ImportError: + raise nose.SkipTest + IndexType = namedtuple("IndexType", ["a", "b"]) + idx1 = IndexType("foo", "bar") + idx2 = IndexType("baz", "bof") + index = Index([idx1, idx2], name="composite_index") + df = DataFrame([(1, 2), (3, 4)], index=index, columns=["A", "B"]) + self.assertEqual(df.ix[IndexType("foo", "bar")]["A"], 1) + + def test_bool_raises_value_error_1069(self): + df = DataFrame([1, 2, 3]) + self.failUnlessRaises(ValueError, lambda: bool(df)) + + def test_any_all(self): + self._check_bool_op('any', np.any, has_skipna=True, has_bool_only=True) + self._check_bool_op('all', np.all, has_skipna=True, has_bool_only=True) + + def test_consolidate_datetime64(self): + # numpy vstack bug + + data = """\ +starting,ending,measure +2012-06-21 00:00,2012-06-23 07:00,77 +2012-06-23 07:00,2012-06-23 16:30,65 +2012-06-23 16:30,2012-06-25 08:00,77 +2012-06-25 08:00,2012-06-26 12:00,0 +2012-06-26 12:00,2012-06-27 08:00,77 +""" + df = read_csv(StringIO(data), parse_dates=[0,1]) + + ser_starting = df.starting + ser_starting.index = 
ser_starting.values + ser_starting = ser_starting.tz_localize('US/Eastern') + ser_starting = ser_starting.tz_convert('UTC') + + ser_ending = df.ending + ser_ending.index = ser_ending.values + ser_ending = ser_ending.tz_localize('US/Eastern') + ser_ending = ser_ending.tz_convert('UTC') + + df.starting = ser_starting.index + df.ending = ser_ending.index + + assert_array_equal(df.starting.values, ser_starting.index.values) + assert_array_equal(df.ending.values, ser_ending.index.values) + + def _check_bool_op(self, name, alternative, frame=None, has_skipna=True, + has_bool_only=False): + if frame is None: + frame = self.frame > 0 + # set some NAs + frame = DataFrame(frame.values.astype(object), frame.index, + frame.columns) + frame.ix[5:10] = np.nan + frame.ix[15:20, -2:] = np.nan + + f = getattr(frame, name) + + if has_skipna: + def skipna_wrapper(x): + nona = x.dropna().values + return alternative(nona) + + def wrapper(x): + return alternative(x.values) + + result0 = f(axis=0, skipna=False) + result1 = f(axis=1, skipna=False) + assert_series_equal(result0, frame.apply(wrapper)) + assert_series_equal(result1, frame.apply(wrapper, axis=1), + check_dtype=False) # HACK: win32 + else: + skipna_wrapper = alternative + wrapper = alternative + + result0 = f(axis=0) + result1 = f(axis=1) + assert_series_equal(result0, frame.apply(skipna_wrapper)) + assert_series_equal(result1, frame.apply(skipna_wrapper, axis=1), + check_dtype=False) + + # result = f(axis=1) + # comp = frame.apply(alternative, axis=1).reindex(result.index) + # assert_series_equal(result, comp) + + self.assertRaises(Exception, f, axis=2) + + # make sure works on mixed-type frame + mixed = self.mixed_frame + mixed['_bool_'] = np.random.randn(len(mixed)) > 0 + getattr(mixed, name)(axis=0) + getattr(mixed, name)(axis=1) + + class NonzeroFail: + + def __nonzero__(self): + raise ValueError + + mixed['_nonzero_fail_'] = NonzeroFail() + + if has_bool_only: + getattr(mixed, name)(axis=0, bool_only=True) + getattr(mixed, name)(axis=1, bool_only=True) + getattr(frame, name)(axis=0, bool_only=False) + getattr(frame, name)(axis=1, bool_only=False) + + # all NA case + if has_skipna: + all_na = frame * np.NaN + r0 = getattr(all_na, name)(axis=0) + r1 = getattr(all_na, name)(axis=1) + if name == 'any': + self.assert_(not r0.any()) + self.assert_(not r1.any()) + else: + self.assert_(r0.all()) + self.assert_(r1.all()) + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--ipdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py new file mode 100644 index 00000000..34ef3c9a --- /dev/null +++ b/pandas/tests/test_graphics.py @@ -0,0 +1,395 @@ +import nose +import os +import string +import unittest + +from datetime import datetime + +from pandas import Series, DataFrame, MultiIndex, PeriodIndex, date_range +import pandas.util.testing as tm + +import numpy as np + +from numpy.testing.decorators import slow +import pandas.tools.plotting as plotting + +class TestSeriesPlots(unittest.TestCase): + + @classmethod + def setUpClass(cls): + import sys + if 'IPython' in sys.modules: + raise nose.SkipTest + + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + raise nose.SkipTest + + def setUp(self): + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.iseries = 
tm.makePeriodSeries() + self.iseries.name = 'iseries' + + @slow + def test_plot(self): + _check_plot_works(self.ts.plot, label='foo') + _check_plot_works(self.ts.plot, use_index=False) + _check_plot_works(self.ts.plot, rot=0) + _check_plot_works(self.ts.plot, style='.', logy=True) + _check_plot_works(self.ts.plot, style='.', logx=True) + _check_plot_works(self.ts.plot, style='.', loglog=True) + _check_plot_works(self.ts[:10].plot, kind='bar') + _check_plot_works(self.series[:5].plot, kind='bar') + _check_plot_works(self.series[:5].plot, kind='line') + _check_plot_works(self.series[:5].plot, kind='barh') + _check_plot_works(self.series[:10].plot, kind='barh') + + Series(np.random.randn(10)).plot(kind='bar',color='black') + + @slow + def test_bar_colors(self): + import matplotlib.pyplot as plt + import matplotlib.colors as colors + + default_colors = 'brgyk' + custom_colors = 'rgcby' + + plt.close('all') + df = DataFrame(np.random.randn(5, 5)) + ax = df.plot(kind='bar') + + rects = ax.patches + + conv = colors.colorConverter + for i, rect in enumerate(rects[:5]): + xp = conv.to_rgba(default_colors[i]) + rs = rect.get_facecolor() + self.assert_(xp, rs) + + plt.close('all') + + ax = df.plot(kind='bar', color=custom_colors) + + rects = ax.patches + + conv = colors.colorConverter + for i, rect in enumerate(rects[:5]): + xp = conv.to_rgba(custom_colors[i]) + rs = rect.get_facecolor() + self.assert_(xp, rs) + + + @slow + def test_irregular_datetime(self): + rng = date_range('1/1/2000', '3/1/2000') + rng = rng[[0,1,2,3,5,9,10,11,12]] + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xp = datetime(1999, 1, 1).toordinal() + ax.set_xlim('1/1/1999', '1/1/2001') + self.assert_(xp == ax.get_xlim()[0]) + + @slow + def test_hist(self): + _check_plot_works(self.ts.hist) + _check_plot_works(self.ts.hist, grid=False) + + @slow + def test_kde(self): + _check_plot_works(self.ts.plot, kind='kde') + _check_plot_works(self.ts.plot, kind='density') + ax = self.ts.plot(kind='kde', logy=True) + self.assert_(ax.get_yscale() == 'log') + + @slow + def test_autocorrelation_plot(self): + from pandas.tools.plotting import autocorrelation_plot + _check_plot_works(autocorrelation_plot, self.ts) + _check_plot_works(autocorrelation_plot, self.ts.values) + + @slow + def test_lag_plot(self): + from pandas.tools.plotting import lag_plot + _check_plot_works(lag_plot, self.ts) + +class TestDataFramePlots(unittest.TestCase): + + @classmethod + def setUpClass(cls): + import sys + if 'IPython' in sys.modules: + raise nose.SkipTest + + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + raise nose.SkipTest + + @slow + def test_plot(self): + df = tm.makeTimeDataFrame() + _check_plot_works(df.plot, grid=False) + _check_plot_works(df.plot, subplots=True) + _check_plot_works(df.plot, subplots=True, use_index=False) + + df = DataFrame({'x':[1,2], 'y':[3,4]}) + self._check_plot_fails(df.plot, kind='line', blarg=True) + + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + _check_plot_works(df.plot, use_index=True) + _check_plot_works(df.plot, sort_columns=False) + _check_plot_works(df.plot, yticks=[1, 5, 10]) + _check_plot_works(df.plot, xticks=[1, 5, 10]) + _check_plot_works(df.plot, ylim=(-100, 100), xlim=(-100, 100)) + _check_plot_works(df.plot, subplots=True, title='blah') + _check_plot_works(df.plot, title='blah') + + tuples = zip(list(string.ascii_letters[:10]), range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + 
_check_plot_works(df.plot, use_index=True) + + @slow + def test_subplots(self): + df = DataFrame(np.random.rand(10, 3), + index=list(string.ascii_letters[:10])) + + axes = df.plot(subplots=True, sharex=True, legend=True) + + for ax in axes: + self.assert_(ax.get_legend() is not None) + + axes = df.plot(subplots=True, sharex=True) + for ax in axes[:-2]: + [self.assert_(not label.get_visible()) + for label in ax.get_xticklabels()] + [self.assert_(label.get_visible()) + for label in ax.get_yticklabels()] + + [self.assert_(label.get_visible()) + for label in axes[-1].get_xticklabels()] + [self.assert_(label.get_visible()) + for label in axes[-1].get_yticklabels()] + + axes = df.plot(subplots=True, sharex=False) + for ax in axes: + [self.assert_(label.get_visible()) + for label in ax.get_xticklabels()] + [self.assert_(label.get_visible()) + for label in ax.get_yticklabels()] + + @slow + def test_plot_bar(self): + df = DataFrame(np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + + _check_plot_works(df.plot, kind='bar') + _check_plot_works(df.plot, kind='bar', legend=False) + _check_plot_works(df.plot, kind='bar', subplots=True) + _check_plot_works(df.plot, kind='bar', stacked=True) + + df = DataFrame(np.random.randn(10, 15), + index=list(string.ascii_letters[:10]), + columns=range(15)) + _check_plot_works(df.plot, kind='bar') + + df = DataFrame({'a': [0, 1], 'b': [1, 0]}) + _check_plot_works(df.plot, kind='bar') + + @slow + def test_boxplot(self): + df = DataFrame(np.random.randn(6, 4), + index=list(string.ascii_letters[:6]), + columns=['one', 'two', 'three', 'four']) + df['indic'] = ['foo', 'bar'] * 3 + df['indic2'] = ['foo', 'bar', 'foo'] * 2 + + _check_plot_works(df.boxplot) + _check_plot_works(df.boxplot, column=['one', 'two']) + _check_plot_works(df.boxplot, column=['one', 'two'], + by='indic') + _check_plot_works(df.boxplot, column='one', by=['indic', 'indic2']) + _check_plot_works(df.boxplot, by='indic') + _check_plot_works(df.boxplot, by=['indic', 'indic2']) + + _check_plot_works(lambda x: plotting.boxplot(x), df['one']) + + _check_plot_works(df.boxplot, notch=1) + _check_plot_works(df.boxplot, by='indic', notch=1) + + df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + _check_plot_works(df.boxplot, by='X') + + @slow + def test_kde(self): + df = DataFrame(np.random.randn(100, 4)) + _check_plot_works(df.plot, kind='kde') + _check_plot_works(df.plot, kind='kde', subplots=True) + axes = df.plot(kind='kde', logy=True, subplots=True) + for ax in axes: + self.assert_(ax.get_yscale() == 'log') + + @slow + def test_hist(self): + df = DataFrame(np.random.randn(100, 4)) + _check_plot_works(df.hist) + _check_plot_works(df.hist, grid=False) + + #make sure layout is handled + df = DataFrame(np.random.randn(100, 3)) + _check_plot_works(df.hist) + axes = df.hist(grid=False) + self.assert_(not axes[1, 1].get_visible()) + + df = DataFrame(np.random.randn(100, 1)) + _check_plot_works(df.hist) + + #make sure layout is handled + df = DataFrame(np.random.randn(100, 6)) + _check_plot_works(df.hist) + + #make sure sharex, sharey is handled + _check_plot_works(df.hist, sharex=True, sharey=True) + + #make sure kwargs are handled + ser = df[0] + xf, yf = 20, 20 + xrot, yrot = 30, 30 + ax = ser.hist(xlabelsize=xf, xrot=30, ylabelsize=yf, yrot=30) + ytick = ax.get_yticklabels()[0] + xtick = ax.get_xticklabels()[0] + self.assertAlmostEqual(ytick.get_fontsize(), yf) + 
self.assertAlmostEqual(ytick.get_rotation(), yrot) + self.assertAlmostEqual(xtick.get_fontsize(), xf) + self.assertAlmostEqual(xtick.get_rotation(), xrot) + + xf, yf = 20, 20 + xrot, yrot = 30, 30 + axes = df.hist(xlabelsize=xf, xrot=30, ylabelsize=yf, yrot=30) + for i, ax in enumerate(axes.ravel()): + if i < len(df.columns): + ytick = ax.get_yticklabels()[0] + xtick = ax.get_xticklabels()[0] + self.assertAlmostEqual(ytick.get_fontsize(), yf) + self.assertAlmostEqual(ytick.get_rotation(), yrot) + self.assertAlmostEqual(xtick.get_fontsize(), xf) + self.assertAlmostEqual(xtick.get_rotation(), xrot) + + @slow + def test_scatter(self): + df = DataFrame(np.random.randn(100, 4)) + import pandas.tools.plotting as plt + def scat(**kwds): + return plt.scatter_matrix(df, **kwds) + _check_plot_works(scat) + _check_plot_works(scat, marker='+') + _check_plot_works(scat, vmin=0) + _check_plot_works(scat, diagonal='kde') + _check_plot_works(scat, diagonal='density') + _check_plot_works(scat, diagonal='hist') + + def scat2(x, y, by=None, ax=None, figsize=None): + return plt.scatter_plot(df, x, y, by, ax, figsize=None) + + _check_plot_works(scat2, 0, 1) + grouper = Series(np.repeat([1, 2, 3, 4, 5], 20), df.index) + _check_plot_works(scat2, 0, 1, by=grouper) + + @slow + def test_andrews_curves(self): + from pandas import read_csv + from pandas.tools.plotting import andrews_curves + path = os.path.join(curpath(), 'data/iris.csv') + df = read_csv(path) + _check_plot_works(andrews_curves, df, 'Name') + + @slow + def test_plot_int_columns(self): + df = DataFrame(np.random.randn(100, 4)).cumsum() + _check_plot_works(df.plot, legend=True) + + @slow + def test_legend_name(self): + multi = DataFrame(np.random.randn(4, 4), + columns=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + multi.columns.names = ['group', 'individual'] + + ax = multi.plot() + leg_title = ax.legend_.get_title() + self.assert_(leg_title.get_text(), 'group,individual') + + def _check_plot_fails(self, f, *args, **kwargs): + self.assertRaises(Exception, f, *args, **kwargs) + +class TestDataFrameGroupByPlots(unittest.TestCase): + + @classmethod + def setUpClass(cls): + import sys + if 'IPython' in sys.modules: + raise nose.SkipTest + + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + raise nose.SkipTest + + @slow + def test_boxplot(self): + df = DataFrame(np.random.rand(10,2), columns=['Col1', 'Col2'] ) + df['X'] = Series(['A','A','A','A','A','B','B','B','B','B']) + grouped = df.groupby(by='X') + _check_plot_works(grouped.boxplot) + _check_plot_works(grouped.boxplot, subplots=False) + + tuples = zip(list(string.ascii_letters[:10]), range(10)) + df = DataFrame(np.random.rand(10, 3), + index=MultiIndex.from_tuples(tuples)) + grouped = df.groupby(level=1) + _check_plot_works(grouped.boxplot) + _check_plot_works(grouped.boxplot, subplots=False) + grouped = df.unstack(level=1).groupby(level=0, axis=1) + _check_plot_works(grouped.boxplot) + _check_plot_works(grouped.boxplot, subplots=False) + +PNG_PATH = 'tmp.png' + +def _check_plot_works(f, *args, **kwargs): + import matplotlib.pyplot as plt + + fig = plt.gcf() + plt.clf() + ax = fig.add_subplot(211) + ret = f(*args, **kwargs) + assert(ret is not None) # do something more intelligent + + ax = fig.add_subplot(212) + try: + kwargs['ax'] = ax + ret = f(*args, **kwargs) + assert(ret is not None) # do something more intelligent + except Exception: + pass + plt.savefig(PNG_PATH) + os.remove(PNG_PATH) + +def curpath(): + pth, _ = 
os.path.split(os.path.abspath(__file__)) + return pth + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py new file mode 100644 index 00000000..eb4fc3ff --- /dev/null +++ b/pandas/tests/test_groupby.py @@ -0,0 +1,2051 @@ +import nose +import unittest + +from datetime import datetime +from numpy import nan + +from pandas import bdate_range +from pandas.core.index import Index, MultiIndex +from pandas.core.common import rands +from pandas.core.api import Categorical, DataFrame +from pandas.core.groupby import GroupByError +from pandas.core.series import Series +from pandas.util.testing import (assert_panel_equal, assert_frame_equal, + assert_series_equal, assert_almost_equal) +from pandas.core.panel import Panel +from pandas.tools.merge import concat +from collections import defaultdict +import pandas.core.common as com +import pandas.core.datetools as dt +import numpy as np +from numpy.testing import assert_equal + +import pandas.util.testing as tm + +def commonSetUp(self): + self.dateRange = bdate_range('1/1/2005', periods=250) + self.stringIndex = Index([rands(8).upper() for x in xrange(250)]) + + self.groupId = Series([x[0] for x in self.stringIndex], + index=self.stringIndex) + self.groupDict = dict((k, v) for k, v in self.groupId.iteritems()) + + self.columnIndex = Index(['A', 'B', 'C', 'D', 'E']) + + randMat = np.random.randn(250, 5) + self.stringMatrix = DataFrame(randMat, columns=self.columnIndex, + index=self.stringIndex) + + self.timeMatrix = DataFrame(randMat, columns=self.columnIndex, + index=self.dateRange) + +class TestGroupBy(unittest.TestCase): + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B' : ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C' : np.random.randn(8), + 'D' : np.random.randn(8)}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + def test_basic(self): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + for k, v in grouped: + self.assertEqual(len(v), 3) + + agged = grouped.aggregate(np.mean) + self.assertEqual(agged[1], 1) + + assert_series_equal(agged, grouped.agg(np.mean)) # shorthand + assert_series_equal(agged, grouped.mean()) + + # Cython only returning floating point for now... 
+        assert_series_equal(grouped.agg(np.sum).astype(float),
+                            grouped.sum())
+
+        transformed = grouped.transform(lambda x: x * x.sum())
+        self.assertEqual(transformed[7], 12)
+
+        value_grouped = data.groupby(data)
+        assert_series_equal(value_grouped.aggregate(np.mean), agged)
+
+        # complex agg
+        agged = grouped.aggregate([np.mean, np.std])
+        agged = grouped.aggregate({'one' : np.mean,
+                                   'two' : np.std})
+
+        group_constants = {
+            0 : 10,
+            1 : 20,
+            2 : 30
+        }
+        agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
+        self.assertEqual(agged[1], 21)
+
+        # corner cases
+        self.assertRaises(Exception, grouped.aggregate, lambda x: x * 2)
+
+    def test_first_last_nth(self):
+        # tests for first / last / nth
+        grouped = self.df.groupby('A')
+        first = grouped.first()
+        expected = self.df.ix[[1, 0], ['C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(first, expected)
+
+        last = grouped.last()
+        expected = self.df.ix[[5, 7], ['C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(last, expected)
+
+        nth = grouped.nth(1)
+        expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
+        expected.index = ['bar', 'foo']
+        assert_frame_equal(nth, expected)
+
+        # it works!
+        grouped['B'].first()
+        grouped['B'].last()
+        grouped['B'].nth(0)
+
+        self.df['B'][self.df['A'] == 'foo'] = np.nan
+        self.assert_(com.isnull(grouped['B'].first()['foo']))
+        self.assert_(com.isnull(grouped['B'].last()['foo']))
+        self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
+
+    def test_grouper_iter(self):
+        self.assertEqual(sorted(self.df.groupby('A').grouper), ['bar', 'foo'])
+
+    def test_empty_groups(self):
+        # GH # 1048
+        self.assertRaises(ValueError, self.df.groupby, [])
+
+    def test_groupby_grouper(self):
+        grouped = self.df.groupby('A')
+
+        result = self.df.groupby(grouped.grouper).mean()
+        expected = grouped.mean()
+        assert_frame_equal(result, expected)
+
+    def test_groupby_dict_mapping(self):
+        # GH #679
+        from pandas import Series
+        s = Series({'T1': 5})
+        result = s.groupby({'T1': 'T2'}).agg(sum)
+        expected = s.groupby(['T2']).agg(sum)
+        assert_series_equal(result, expected)
+
+        s = Series([1., 2., 3., 4.], index=list('abcd'))
+        mapping = {'a' : 0, 'b' : 0, 'c' : 1, 'd' : 1}
+
+        result = s.groupby(mapping).mean()
+        result2 = s.groupby(mapping).agg(np.mean)
+        expected = s.groupby([0, 0, 1, 1]).mean()
+        expected2 = s.groupby([0, 0, 1, 1]).mean()
+        assert_series_equal(result, expected)
+        assert_series_equal(result, result2)
+        assert_series_equal(result, expected2)
+
+    def test_groupby_nonobject_dtype(self):
+        key = self.mframe.index.labels[0]
+        grouped = self.mframe.groupby(key)
+        result = grouped.sum()
+
+        expected = self.mframe.groupby(key.astype('O')).sum()
+        assert_frame_equal(result, expected)
+
+    def test_agg_regression1(self):
+        grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month])
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        assert_frame_equal(result, expected)
+
+    def test_agg_datetimes_mixed(self):
+        data = [[1, '2012-01-01', 1.0],
+                [2, '2012-01-02', 2.0],
+                [3, None, 3.0]]
+
+        df1 = DataFrame({'key': [x[0] for x in data],
+                         'date': [x[1] for x in data],
+                         'value': [x[2] for x in data]})
+
+        data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date()
+                 if row[1] else None, row[2]] for row in data]
+
+        df2 = DataFrame({'key': [x[0] for x in data],
+                         'date': [x[1] for x in data],
+                         'value': [x[2] for x in data]})
+
+        df1['weights'] = df1['value']/df1['value'].sum()
+        gb1 = df1.groupby('date').aggregate(np.sum)
+
+        df2['weights'] = df1['value']/df1['value'].sum()
+        gb2 = df2.groupby('date').aggregate(np.sum)
+
+        assert(len(gb1) == len(gb2))
+
+    def test_agg_must_agg(self):
+        grouped = self.df.groupby('A')['C']
+        self.assertRaises(Exception, grouped.agg, lambda x: x.describe())
+        self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2])
+
+    def test_agg_ser_multi_key(self):
+        ser = self.df.C
+        f = lambda x: x.sum()
+        results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f)
+        expected = self.df.groupby(['A', 'B']).sum()['C']
+        assert_series_equal(results, expected)
+
+    def test_get_group(self):
+        wp = tm.makePanel()
+        grouped = wp.groupby(lambda x: x.month, axis='major')
+
+        gp = grouped.get_group(1)
+        expected = wp.reindex(major=[x for x in wp.major_axis if x.month == 1])
+        assert_panel_equal(gp, expected)
+
+    def test_agg_apply_corner(self):
+        # nothing to group, all NA
+        grouped = self.ts.groupby(self.ts * np.nan)
+
+        assert_series_equal(grouped.sum(), Series([]))
+        assert_series_equal(grouped.agg(np.sum), Series([]))
+        assert_series_equal(grouped.apply(np.sum), Series([]))
+
+        # DataFrame
+        grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan)
+        assert_frame_equal(grouped.sum(),
+                           DataFrame(columns=self.tsframe.columns))
+        assert_frame_equal(grouped.agg(np.sum),
+                           DataFrame(columns=self.tsframe.columns))
+        assert_frame_equal(grouped.apply(np.sum), DataFrame({}))
+
+    def test_agg_grouping_is_list_tuple(self):
+        from pandas.core.groupby import Grouping
+
+        df = tm.makeTimeDataFrame()
+
+        grouped = df.groupby(lambda x: x.year)
+        grouper = grouped.grouper.groupings[0].grouper
+        grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper))
+
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        tm.assert_frame_equal(result, expected)
+
+        grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper))
+
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        tm.assert_frame_equal(result, expected)
+
+    def test_agg_python_multiindex(self):
+        grouped = self.mframe.groupby(['A', 'B'])
+
+        result = grouped.agg(np.mean)
+        expected = grouped.mean()
+        tm.assert_frame_equal(result, expected)
+
+    def test_apply_describe_bug(self):
+        grouped = self.mframe.groupby(level='first')
+        result = grouped.describe() # it works!
+
+    def test_len(self):
+        df = tm.makeTimeDataFrame()
+        grouped = df.groupby([lambda x: x.year,
+                              lambda x: x.month,
+                              lambda x: x.day])
+        self.assertEquals(len(grouped), len(df))
+
+        grouped = df.groupby([lambda x: x.year,
+                              lambda x: x.month])
+        expected = len(set([(x.year, x.month) for x in df.index]))
+        self.assertEquals(len(grouped), expected)
+
+    def test_groups(self):
+        grouped = self.df.groupby(['A'])
+        groups = grouped.groups
+        self.assert_(groups is grouped.groups) # caching works
+
+        for k, v in grouped.groups.iteritems():
+            self.assert_((self.df.ix[v]['A'] == k).all())
+
+        grouped = self.df.groupby(['A', 'B'])
+        groups = grouped.groups
+        self.assert_(groups is grouped.groups) # caching works
+        for k, v in grouped.groups.iteritems():
+            self.assert_((self.df.ix[v]['A'] == k[0]).all())
+            self.assert_((self.df.ix[v]['B'] == k[1]).all())
+
+    def test_aggregate_str_func(self):
+        def _check_results(grouped):
+            # single series
+            result = grouped['A'].agg('std')
+            expected = grouped['A'].std()
+            assert_series_equal(result, expected)
+
+            # group frame by function name
+            result = grouped.aggregate('var')
+            expected = grouped.var()
+            assert_frame_equal(result, expected)
+
+            # group frame by function dict
+            result = grouped.agg({'A' : 'var', 'B' : 'std', 'C' : 'mean'})
+            expected = DataFrame({'A' : grouped['A'].var(),
+                                  'B' : grouped['B'].std(),
+                                  'C' : grouped['C'].mean()})
+            assert_frame_equal(result, expected)
+
+        by_weekday = self.tsframe.groupby(lambda x: x.weekday())
+        _check_results(by_weekday)
+
+        by_mwkday = self.tsframe.groupby([lambda x: x.month,
+                                          lambda x: x.weekday()])
+        _check_results(by_mwkday)
+
+    def test_aggregate_item_by_item(self):
+
+        df = self.df.copy()
+        df['E'] = ['a'] * len(self.df)
+        grouped = self.df.groupby('A')
+        def aggfun(ser):
+            return len(ser + 'a')
+        result = grouped.agg(aggfun)
+        self.assertEqual(len(result.columns), 1)
+
+        aggfun = lambda ser: ser.size
+        result = grouped.agg(aggfun)
+        foo = (self.df.A == 'foo').sum()
+        bar = (self.df.A == 'bar').sum()
+        self.assert_((result.xs('foo') == foo).all())
+        self.assert_((result.xs('bar') == bar).all())
+
+        def aggfun(ser):
+            return ser.size
+        result = DataFrame().groupby(self.df.A).agg(aggfun)
+        self.assert_(isinstance(result, DataFrame))
+        self.assertEqual(len(result), 0)
+
+    def test_basic_regression(self):
+        # regression
+        T = [1.0*x for x in range(1,10) *10][:1095]
+        result = Series(T, range(0, len(T)))
+
+        groupings = np.random.random((1100,))
+        groupings = Series(groupings, range(0, len(groupings))) * 10.
+ + grouped = result.groupby(groupings) + grouped.mean() + + def test_transform(self): + data = Series(np.arange(9) // 3, index=np.arange(9)) + + index = np.arange(9) + np.random.shuffle(index) + data = data.reindex(index) + + grouped = data.groupby(lambda x: x // 3) + + transformed = grouped.transform(lambda x: x * x.sum()) + self.assertEqual(transformed[7], 12) + + def test_transform_broadcast(self): + grouped = self.ts.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + + self.assert_(result.index.equals(self.ts.index)) + for _, gp in grouped: + self.assert_((result.reindex(gp.index) == gp.mean()).all()) + + grouped = self.tsframe.groupby(lambda x: x.month) + result = grouped.transform(np.mean) + self.assert_(result.index.equals(self.tsframe.index)) + for _, gp in grouped: + agged = gp.mean() + res = result.reindex(gp.index) + for col in self.tsframe: + self.assert_((res[col] == agged[col]).all()) + + # group columns + grouped = self.tsframe.groupby({'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1}, + axis=1) + result = grouped.transform(np.mean) + self.assert_(result.index.equals(self.tsframe.index)) + self.assert_(result.columns.equals(self.tsframe.columns)) + for _, gp in grouped: + agged = gp.mean(1) + res = result.reindex(columns=gp.columns) + for idx in gp.index: + self.assert_((res.xs(idx) == agged[idx]).all()) + + def test_transform_multiple(self): + grouped = self.ts.groupby([lambda x: x.year, lambda x: x.month]) + + transformed = grouped.transform(lambda x: x * 2) + broadcasted = grouped.transform(np.mean) + + def test_dispatch_transform(self): + df = self.tsframe[::5].reindex(self.tsframe.index) + + grouped = df.groupby(lambda x: x.month) + + filled = grouped.fillna(method='pad') + fillit = lambda x: x.fillna(method='pad') + expected = df.groupby(lambda x: x.month).transform(fillit) + assert_frame_equal(filled, expected) + + def test_transform_select_columns(self): + f = lambda x: x.mean() + result = self.df.groupby('A')['C', 'D'].transform(f) + + selection = self.df[['C', 'D']] + expected = selection.groupby(self.df['A']).transform(f) + + assert_frame_equal(result, expected) + + def test_transform_exclude_nuisance(self): + expected = {} + grouped = self.df.groupby('A') + expected['C'] = grouped['C'].transform(np.mean) + expected['D'] = grouped['D'].transform(np.mean) + expected = DataFrame(expected) + + result = self.df.groupby('A').transform(np.mean) + + assert_frame_equal(result, expected) + + def test_transform_function_aliases(self): + result = self.df.groupby('A').transform('mean') + expected = self.df.groupby('A').transform(np.mean) + assert_frame_equal(result, expected) + + result = self.df.groupby('A')['C'].transform('mean') + expected = self.df.groupby('A')['C'].transform(np.mean) + assert_series_equal(result, expected) + + def test_with_na(self): + index = Index(np.arange(10)) + values = Series(np.ones(10), index) + labels = Series([nan, 'foo', 'bar', 'bar', nan, nan, 'bar', + 'bar', nan, 'foo'], index=index) + + grouped = values.groupby(labels) + agged = grouped.agg(len) + expected = Series([4, 2], index=['bar', 'foo']) + + assert_series_equal(agged, expected, check_dtype=False) + self.assert_(issubclass(agged.dtype.type, np.integer)) + + def test_attr_wrapper(self): + grouped = self.ts.groupby(lambda x: x.weekday()) + + result = grouped.std() + expected = grouped.agg(lambda x: np.std(x, ddof=1)) + assert_series_equal(result, expected) + + # this is pretty cool + result = grouped.describe() + expected = {} + for name, gp in grouped: + expected[name] = 
gp.describe() + expected = DataFrame(expected).T + assert_frame_equal(result.unstack(), expected) + + # get attribute + result = grouped.dtype + expected = grouped.agg(lambda x: x.dtype) + + # make sure raises error + self.assertRaises(AttributeError, getattr, grouped, 'foo') + + def test_series_describe_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.describe().unstack() + assert_series_equal(result['mean'], grouped.mean()) + assert_series_equal(result['std'], grouped.std()) + assert_series_equal(result['min'], grouped.min()) + + def test_series_describe_single(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x.describe()) + expected = grouped.describe() + assert_series_equal(result, expected) + + def test_series_agg_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + + def test_series_agg_multi_pure_python(self): + data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + def bad(x): + assert(len(x.base) > 0) + return 'foo' + + result = data.groupby(['A', 'B']).agg(bad) + expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + assert_frame_equal(result, expected) + + def test_series_index_name(self): + grouped = self.df.ix[:, ['C']].groupby(self.df['A']) + result = grouped.agg(lambda x: x.mean()) + self.assertEqual(result.index.name, 'A') + + def test_frame_describe_multikey(self): + grouped = self.tsframe.groupby([lambda x: x.year, + lambda x: x.month]) + result = grouped.describe() + + for col in self.tsframe: + expected = grouped[col].describe() + assert_series_equal(result[col], expected) + + groupedT = self.tsframe.groupby({'A' : 0, 'B' : 0, + 'C' : 1, 'D' : 1}, axis=1) + result = groupedT.describe() + + for name, group in groupedT: + assert_frame_equal(result[name], group.describe()) + + def test_frame_groupby(self): + grouped = self.tsframe.groupby(lambda x: x.weekday()) + + # aggregate + aggregated = grouped.aggregate(np.mean) + self.assertEqual(len(aggregated), 5) + self.assertEqual(len(aggregated.columns), 4) + + # by string + tscopy = self.tsframe.copy() + tscopy['weekday'] = [x.weekday() for x in tscopy.index] + stragged = tscopy.groupby('weekday').aggregate(np.mean) + assert_frame_equal(stragged, aggregated) + + # transform + transformed = grouped.transform(lambda x: x - x.mean()) + self.assertEqual(len(transformed), 30) + self.assertEqual(len(transformed.columns), 4) + + # transform propagate + transformed = grouped.transform(lambda x: x.mean()) + for name, group in grouped: + mean = group.mean() + for idx in group.index: + assert_almost_equal(transformed.xs(idx), mean) + + # iterate + for weekday, group in grouped: + self.assert_(group.index[0].weekday() == weekday) + + # groups / group_indices + groups = grouped.groups + indices = grouped.indices + + for k, v in groups.iteritems(): + samething = self.tsframe.index.take(indices[k]) + self.assert_((samething == v).all()) + + def test_grouping_is_iterable(self): + # this code path isn't used anywhere 
else + # not sure it's useful + grouped = self.tsframe.groupby([lambda x: x.weekday(), + lambda x: x.year]) + + # test it works + for g in grouped.grouper.groupings[0]: + pass + + def test_frame_groupby_columns(self): + mapping = { + 'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1 + } + grouped = self.tsframe.groupby(mapping, axis=1) + + # aggregate + aggregated = grouped.aggregate(np.mean) + self.assertEqual(len(aggregated), len(self.tsframe)) + self.assertEqual(len(aggregated.columns), 2) + + # transform + tf = lambda x: x - x.mean() + groupedT = self.tsframe.T.groupby(mapping, axis=0) + assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf)) + + # iterate + for k, v in grouped: + self.assertEqual(len(v.columns), 2) + + def test_frame_set_name_single(self): + grouped = self.df.groupby('A') + + result = grouped.mean() + self.assert_(result.index.name == 'A') + + result = self.df.groupby('A', as_index=False).mean() + self.assert_(result.index.name != 'A') + + result = grouped.agg(np.mean) + self.assert_(result.index.name == 'A') + + result = grouped.agg({'C' : np.mean, 'D' : np.std}) + self.assert_(result.index.name == 'A') + + result = grouped['C'].mean() + self.assert_(result.index.name == 'A') + result = grouped['C'].agg(np.mean) + self.assert_(result.index.name == 'A') + result = grouped['C'].agg([np.mean, np.std]) + self.assert_(result.index.name == 'A') + + result = grouped['C'].agg({'foo' : np.mean, 'bar' : np.std}) + self.assert_(result.index.name == 'A') + + def test_multi_iter(self): + s = Series(np.arange(6)) + k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + + grouped = s.groupby([k1, k2]) + + iterated = list(grouped) + expected = [('a', '1', s[[0, 2]]), + ('a', '2', s[[1]]), + ('b', '1', s[[4]]), + ('b', '2', s[[3, 5]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + self.assert_(e1 == one) + self.assert_(e2 == two) + assert_series_equal(three, e3) + + def test_multi_iter_frame(self): + k1 = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + k2 = np.array(['1', '2', '1', '2', '1', '2']) + df = DataFrame({'v1' : np.random.randn(6), + 'v2' : np.random.randn(6), + 'k1' : k1, 'k2' : k2}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + + grouped = df.groupby(['k1', 'k2']) + + # things get sorted! 
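+ # iteration proceeds in sorted group-key order, hence the 'a' groups appear before the 'b' groups in `expected` below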
+ iterated = list(grouped) + idx = df.index + expected = [('a', '1', df.ix[idx[[4]]]), + ('a', '2', df.ix[idx[[3, 5]]]), + ('b', '1', df.ix[idx[[0, 2]]]), + ('b', '2', df.ix[idx[[1]]])] + for i, ((one, two), three) in enumerate(iterated): + e1, e2, e3 = expected[i] + self.assert_(e1 == one) + self.assert_(e2 == two) + assert_frame_equal(three, e3) + + # don't iterate through groups with no data + df['k1'] = np.array(['b', 'b', 'b', 'a', 'a', 'a']) + df['k2'] = np.array(['1', '1', '1', '2', '2', '2']) + grouped = df.groupby(['k1', 'k2']) + groups = {} + for key, gp in grouped: + groups[key] = gp + self.assertEquals(len(groups), 2) + + # axis = 1 + three_levels = self.three_group.groupby(['A', 'B', 'C']).mean() + grouped = three_levels.T.groupby(axis=1, level=(1, 2)) + for key, group in grouped: + pass + + def test_multi_iter_panel(self): + wp = tm.makePanel() + grouped = wp.groupby([lambda x: x.month, lambda x: x.weekday()], + axis=1) + + for (month, wd), group in grouped: + exp_axis = [x for x in wp.major_axis + if x.month == month and x.weekday() == wd] + expected = wp.reindex(major=exp_axis) + assert_panel_equal(group, expected) + + def test_multi_func(self): + col1 = self.df['A'] + col2 = self.df['B'] + + grouped = self.df.groupby([col1.get, col2.get]) + agged = grouped.mean() + expected = self.df.groupby(['A', 'B']).mean() + assert_frame_equal(agged.ix[:, ['C', 'D']], + expected.ix[:, ['C', 'D']]) + + # some "groups" with no data + df = DataFrame({'v1' : np.random.randn(6), + 'v2' : np.random.randn(6), + 'k1' : np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2' : np.array(['1', '1', '1', '2', '2', '2'])}, + index=['one', 'two', 'three', 'four', 'five', 'six']) + # only verify that it works for now + grouped = df.groupby(['k1', 'k2']) + grouped.agg(np.sum) + + def test_multi_key_multiple_functions(self): + grouped = self.df.groupby(['A', 'B'])['C'] + + agged = grouped.agg([np.mean, np.std]) + expected = DataFrame({'mean' : grouped.agg(np.mean), + 'std' : grouped.agg(np.std)}) + assert_frame_equal(agged, expected) + + def test_frame_multi_key_function_list(self): + data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + grouped = data.groupby(['A', 'B']) + funcs = [np.mean, np.std] + agged = grouped.agg(funcs) + expected = concat([grouped['D'].agg(funcs), grouped['E'].agg(funcs), + grouped['F'].agg(funcs)], + keys=['D', 'E', 'F'], axis=1) + assert(isinstance(agged.index, MultiIndex)) + assert(isinstance(expected.index, MultiIndex)) + assert_frame_equal(agged, expected) + + def test_groupby_multiple_columns(self): + data = self.df + grouped = data.groupby(['A', 'B']) + + def _check_op(op): + + result1 = op(grouped) + + expected = defaultdict(dict) + for n1, gp1 in data.groupby('A'): + for n2, gp2 in gp1.groupby('B'): + expected[n1][n2] = op(gp2.ix[:, ['C', 'D']]) + expected = dict((k, DataFrame(v)) for k, v in expected.iteritems()) + expected = Panel.fromDict(expected).swapaxes(0, 1) + + # a little bit crude + for col in ['C', 'D']: + result_col = op(grouped[col]) + exp = expected[col] + pivoted = result1[col].unstack() + pivoted2 = result_col.unstack() + assert_frame_equal(pivoted.reindex_like(exp), exp) + assert_frame_equal(pivoted2.reindex_like(exp), 
exp) + + _check_op(lambda x: x.sum()) + _check_op(lambda x: x.mean()) + + # test single series works the same + result = data['C'].groupby([data['A'], data['B']]).mean() + expected = data.groupby(['A', 'B']).mean()['C'] + + assert_series_equal(result, expected) + + def test_groupby_as_index_agg(self): + grouped = self.df.groupby('A', as_index=False) + + # single-key + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg({'C' : np.mean, 'D' : np.sum}) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + grouped = self.df.groupby('A', as_index=True) + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C' : 'Q'}) + result3 = grouped['C'].agg({'Q' : np.sum}) + assert_frame_equal(result3, expected3) + + # multi-key + + grouped = self.df.groupby(['A', 'B'], as_index=False) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + result2 = grouped.agg({'C' : np.mean, 'D' : np.sum}) + expected2 = grouped.mean() + expected2['D'] = grouped.sum()['D'] + assert_frame_equal(result2, expected2) + + expected3 = grouped['C'].sum() + expected3 = DataFrame(expected3).rename(columns={'C' : 'Q'}) + result3 = grouped['C'].agg({'Q' : np.sum}) + assert_frame_equal(result3, expected3) + + def test_multifunc_select_col_integer_cols(self): + df = self.df + df.columns = np.arange(len(df.columns)) + + # it works! + result = df.groupby(1, as_index=False)[2].agg({'Q' : np.mean}) + + def test_as_index_series_return_frame(self): + grouped = self.df.groupby('A', as_index=False) + grouped2 = self.df.groupby(['A', 'B'], as_index=False) + + result = grouped['C'].agg(np.sum) + expected = grouped.agg(np.sum).ix[:, ['A', 'C']] + self.assert_(isinstance(result, DataFrame)) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].agg(np.sum) + expected2 = grouped2.agg(np.sum).ix[:, ['A', 'B', 'C']] + self.assert_(isinstance(result2, DataFrame)) + assert_frame_equal(result2, expected2) + + result = grouped['C'].sum() + expected = grouped.sum().ix[:, ['A', 'C']] + self.assert_(isinstance(result, DataFrame)) + assert_frame_equal(result, expected) + + result2 = grouped2['C'].sum() + expected2 = grouped2.sum().ix[:, ['A', 'B', 'C']] + self.assert_(isinstance(result2, DataFrame)) + assert_frame_equal(result2, expected2) + + # corner case + self.assertRaises(Exception, grouped['C'].__getitem__, + 'D') + + def test_groupby_as_index_cython(self): + data = self.df + + # single-key + grouped = data.groupby('A', as_index=False) + result = grouped.mean() + expected = data.groupby(['A']).mean() + expected.insert(0, 'A', expected.index) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + # multi-key + grouped = data.groupby(['A', 'B'], as_index=False) + result = grouped.mean() + expected = data.groupby(['A', 'B']).mean() + + arrays = zip(*expected.index._tuple_index) + expected.insert(0, 'A', arrays[0]) + expected.insert(1, 'B', arrays[1]) + expected.index = np.arange(len(expected)) + assert_frame_equal(result, expected) + + def test_groupby_as_index_series_scalar(self): + grouped = self.df.groupby(['A', 'B'], as_index=False) + + # GH #421 + + result = grouped['C'].agg(len) + expected = grouped.agg(len).ix[:, ['A', 'B', 'C']] + assert_frame_equal(result, expected) + + def test_groupby_as_index_corner(self): + self.assertRaises(TypeError, self.ts.groupby, + lambda x: x.weekday(), 
as_index=False) + + self.assertRaises(ValueError, self.df.groupby, + lambda x: x.lower(), as_index=False, axis=1) + + def test_groupby_multiple_key(self): + df = tm.makeTimeDataFrame() + grouped = df.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day]) + agged = grouped.sum() + assert_almost_equal(df.values, agged.values) + + grouped = df.T.groupby([lambda x: x.year, + lambda x: x.month, + lambda x: x.day], axis=1) + + agged = grouped.agg(lambda x: x.sum(1)) + self.assert_(agged.index.equals(df.columns)) + assert_almost_equal(df.T.values, agged.values) + + agged = grouped.agg(lambda x: x.sum(1)) + assert_almost_equal(df.T.values, agged.values) + + def test_groupby_multi_corner(self): + # test that having an all-NA column doesn't mess you up + df = self.df.copy() + df['bad'] = np.nan + agged = df.groupby(['A', 'B']).mean() + + expected = self.df.groupby(['A', 'B']).mean() + expected['bad'] = np.nan + + assert_frame_equal(agged, expected) + + def test_omit_nuisance(self): + grouped = self.df.groupby('A') + + result = grouped.mean() + expected = self.df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + assert_frame_equal(result, expected) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + df = self.df.ix[:, ['A', 'C', 'D']] + df['E'] = datetime.now() + grouped = df.groupby('A') + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_frame_equal(result, expected) + + # won't work with axis = 1 + grouped = df.groupby({'A' : 0, 'C' : 0, 'D' : 1, 'E' : 1}, axis=1) + result = self.assertRaises(TypeError, grouped.agg, + lambda x: x.sum(1, numeric_only=False)) + + def test_omit_nuisance_python_multiple(self): + grouped = self.three_group.groupby(['A', 'B']) + + agged = grouped.agg(np.mean) + exp = grouped.mean() + assert_frame_equal(agged, exp) + + def test_empty_groups_corner(self): + # handle empty groups + df = DataFrame({'k1' : np.array(['b', 'b', 'b', 'a', 'a', 'a']), + 'k2' : np.array(['1', '1', '1', '2', '2', '2']), + 'k3' : ['foo', 'bar'] * 3, + 'v1' : np.random.randn(6), + 'v2' : np.random.randn(6)}) + + grouped = df.groupby(['k1', 'k2']) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + grouped = self.mframe[3:5].groupby(level=0) + agged = grouped.apply(lambda x: x.mean()) + agged_A = grouped['A'].apply(np.mean) + assert_series_equal(agged['A'], agged_A) + self.assertEquals(agged.index.name, 'first') + + def test_apply_concat_preserve_names(self): + grouped = self.three_group.groupby(['A', 'B']) + + def desc(group): + result = group.describe() + result.index.name = 'stat' + return result + + def desc2(group): + result = group.describe() + result.index.name = 'stat' + result = result[:len(group)] + # weirdo + return result + + def desc3(group): + result = group.describe() + + # names are different + result.index.name = 'stat_%d' % len(group) + + result = result[:len(group)] + # weirdo + return result + + result = grouped.apply(desc) + self.assertEquals(result.index.names, ['A', 'B', 'stat']) + + result2 = grouped.apply(desc2) + self.assertEquals(result2.index.names, ['A', 'B', 'stat']) + + result3 = grouped.apply(desc3) + self.assertEquals(result3.index.names, ['A', 'B', None]) + + def test_nonsense_func(self): + df = DataFrame([0]) + self.assertRaises(Exception, df.groupby, lambda x: x + 'foo') + + def test_cythonized_aggers(self): + data = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], + 'B' : ['A', 'B'] * 6, + 'C' : np.random.randn(12)} + df = DataFrame(data) + 
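+ # punch NaN holes into column C so each cythonized aggregator below is exercised with missing values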
df['C'][2:10:2] = nan + + def _testit(op): + # single column + grouped = df.drop(['B'], axis=1).groupby('A') + exp = {} + for cat, group in grouped: + exp[cat] = op(group['C']) + exp = DataFrame({'C' : exp}) + result = op(grouped) + assert_frame_equal(result, exp) + + # multiple columns + grouped = df.groupby(['A', 'B']) + expd = {} + for (cat1, cat2), group in grouped: + expd.setdefault(cat1, {})[cat2] = op(group['C']) + exp = DataFrame(expd).T.stack(dropna=False) + result = op(grouped)['C'] + assert_series_equal(result, exp) + + _testit(lambda x: x.sum()) + _testit(lambda x: x.mean()) + _testit(lambda x: x.prod()) + _testit(lambda x: x.min()) + _testit(lambda x: x.max()) + + def test_cython_agg_boolean(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': np.random.randint(0, 2, 50).astype('bool')}) + result = frame.groupby('a')['b'].mean() + expected = frame.groupby('a')['b'].agg(np.mean) + + assert_series_equal(result, expected) + + def test_cython_agg_nothing_to_agg(self): + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(GroupByError, frame.groupby('a')['b'].mean) + + frame = DataFrame({'a': np.random.randint(0, 5, 50), + 'b': ['foo', 'bar'] * 25}) + self.assertRaises(GroupByError, frame[['b']].groupby(frame['a']).mean) + + def test_wrap_aggregated_output_multindex(self): + df = self.mframe.T + df['baz', 'two'] = 'peekaboo' + + keys = [np.array([0, 0, 1]), np.array([0, 0, 1])] + agged = df.groupby(keys).agg(np.mean) + self.assert_(isinstance(agged.columns, MultiIndex)) + + def aggfun(ser): + if ser.name == ('foo', 'one'): + raise TypeError + else: + return ser.sum() + agged2 = df.groupby(keys).aggregate(aggfun) + self.assertEqual(len(agged2.columns) + 1, len(df.columns)) + + def test_grouping_attrs(self): + deleveled = self.mframe.reset_index() + grouped = deleveled.groupby(['first', 'second']) + + for i, ping in enumerate(grouped.grouper.groupings): + the_counts = self.mframe.groupby(level=i).count()['A'] + other_counts = Series(ping.counts, ping.group_index) + assert_almost_equal(the_counts, + other_counts.reindex(the_counts.index)) + + # compute counts when group by level + grouped = self.mframe.groupby(level=0) + ping = grouped.grouper.groupings[0] + the_counts = grouped.size() + other_counts = Series(ping.counts, ping.group_index) + assert_almost_equal(the_counts, + other_counts.reindex(the_counts.index)) + + def test_groupby_level(self): + frame = self.mframe + deleveled = frame.reset_index() + + result0 = frame.groupby(level=0).sum() + result1 = frame.groupby(level=1).sum() + + expected0 = frame.groupby(deleveled['first'].values).sum() + expected1 = frame.groupby(deleveled['second'].values).sum() + + expected0 = expected0.reindex(frame.index.levels[0]) + expected1 = expected1.reindex(frame.index.levels[1]) + + self.assert_(result0.index.name == 'first') + self.assert_(result1.index.name == 'second') + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + self.assertEquals(result0.index.name, frame.index.names[0]) + self.assertEquals(result1.index.name, frame.index.names[1]) + + # groupby level name + result0 = frame.groupby(level='first').sum() + result1 = frame.groupby(level='second').sum() + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + # axis=1 + + result0 = frame.T.groupby(level=0, axis=1).sum() + result1 = frame.T.groupby(level=1, axis=1).sum() + assert_frame_equal(result0, expected0.T) + assert_frame_equal(result1, expected1.T) + + # 
raise exception for non-MultiIndex + self.assertRaises(ValueError, self.df.groupby, level=1) + + def test_groupby_level_apply(self): + frame = self.mframe + + result = frame.groupby(level=0).count() + self.assert_(result.index.name == 'first') + result = frame.groupby(level=1).count() + self.assert_(result.index.name == 'second') + + result = frame['A'].groupby(level=0).count() + self.assert_(result.index.name == 'first') + + def test_groupby_level_mapper(self): + frame = self.mframe + deleveled = frame.reset_index() + + mapper0 = {'foo' : 0, 'bar' : 0, + 'baz' : 1, 'qux' : 1} + mapper1 = {'one' : 0, 'two' : 0, 'three' : 1} + + result0 = frame.groupby(mapper0, level=0).sum() + result1 = frame.groupby(mapper1, level=1).sum() + + mapped_level0 = np.array([mapper0.get(x) for x in deleveled['first']]) + mapped_level1 = np.array([mapper1.get(x) for x in deleveled['second']]) + expected0 = frame.groupby(mapped_level0).sum() + expected1 = frame.groupby(mapped_level1).sum() + + assert_frame_equal(result0, expected0) + assert_frame_equal(result1, expected1) + + def test_groupby_level_0_nonmulti(self): + # #1313 + a = Series([1,2,3,10,4,5,20,6], Index([1,2,3,1,4,5,2,6], name='foo')) + + result = a.groupby(level=0).sum() + self.assertEquals(result.index.name, a.index.name) + + def test_level_preserve_order(self): + grouped = self.mframe.groupby(level=0) + exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3]) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_grouping_labels(self): + grouped = self.mframe.groupby(self.mframe.index.get_level_values(0)) + exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3]) + assert_almost_equal(grouped.grouper.labels[0], exp_labels) + + def test_cython_fail_agg(self): + dr = bdate_range('1/1/2000', periods=50) + ts = Series(['A', 'B', 'C', 'D', 'E'] * 10, index=dr) + + grouped = ts.groupby(lambda x: x.month) + summed = grouped.sum() + expected = grouped.agg(np.sum) + assert_series_equal(summed, expected) + + def test_apply_series_to_frame(self): + def f(piece): + return DataFrame({'value' : piece, + 'demeaned' : piece - piece.mean(), + 'logged' : np.log(piece)}) + + dr = bdate_range('1/1/2000', periods=100) + ts = Series(np.random.randn(100), index=dr) + + grouped = ts.groupby(lambda x: x.month) + result = grouped.apply(f) + + self.assert_(isinstance(result, DataFrame)) + self.assert_(result.index.equals(ts.index)) + + def test_apply_series_yield_constant(self): + result = self.df.groupby(['A', 'B'])['C'].apply(len) + self.assertEquals(result.index.names[:2], ['A', 'B']) + + def test_apply_frame_to_series(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.apply(len) + expected = grouped.count()['C'] + self.assert_(result.index.equals(expected.index)) + self.assert_(np.array_equal(result.values, expected.values)) + + def test_apply_frame_concat_series(self): + def trans(group): + return group.groupby('B')['C'].sum().order()[:2] + + def trans2(group): + grouped = group.groupby(df.reindex(group.index)['B']) + return grouped.sum().order()[:2] + + df = DataFrame({'A': np.random.randint(0, 5, 1000), + 'B': np.random.randint(0, 5, 1000), + 'C': np.random.randn(1000)}) + + result = df.groupby('A').apply(trans) + exp = df.groupby('A')['C'].apply(trans2) + assert_series_equal(result, exp) + + def test_apply_transform(self): + grouped = self.ts.groupby(lambda x: x.month) + result = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + assert_series_equal(result, expected) + + def test_apply_multikey_corner(self): + 
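+ # apply() under a two-key (year, month) grouping: the result should be indexable by each group key and match f(group)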
grouped = self.tsframe.groupby([lambda x: x.year, + lambda x: x.month]) + + def f(group): + return group.sort('A')[-5:] + + result = grouped.apply(f) + for key, group in grouped: + assert_frame_equal(result.ix[key], f(group)) + + def test_groupby_series_indexed_differently(self): + s1 = Series([5.0,-9.0,4.0,100.,-5.,55.,6.7], + index=Index(['a','b','c','d','e','f','g'])) + s2 = Series([1.0,1.0,4.0,5.0,5.0,7.0], + index=Index(['a','b','d','f','g','h'])) + + grouped = s1.groupby(s2) + agged = grouped.mean() + exp = s1.groupby(s2.reindex(s1.index).get).mean() + assert_series_equal(agged, exp) + + def test_groupby_with_hier_columns(self): + tuples = zip(*[['bar', 'bar', 'baz', 'baz', + 'foo', 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', + 'one', 'two', 'one', 'two']]) + index = MultiIndex.from_tuples(tuples) + columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), + ('B', 'cat'), ('A', 'dog')]) + df = DataFrame(np.random.randn(8, 4), index=index, + columns=columns) + + result = df.groupby(level=0).mean() + self.assert_(result.columns.equals(columns)) + + result = df.groupby(level=0, axis=1).mean() + self.assert_(result.index.equals(df.index)) + + result = df.groupby(level=0).agg(np.mean) + self.assert_(result.columns.equals(columns)) + + result = df.groupby(level=0).apply(lambda x: x.mean()) + self.assert_(result.columns.equals(columns)) + + result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1)) + self.assert_(result.columns.equals(Index(['A', 'B']))) + self.assert_(result.index.equals(df.index)) + + # add a nuisance column + sorted_columns, _ = columns.sortlevel(0) + df['A', 'foo'] = 'bar' + result = df.groupby(level=0).mean() + self.assert_(result.columns.equals(df.columns[:-1])) + + def test_pass_args_kwargs(self): + from pandas.compat.scipy import scoreatpercentile + + def f(x, q=None): + return scoreatpercentile(x, q) + g = lambda x: scoreatpercentile(x, 80) + + # Series + ts_grouped = self.ts.groupby(lambda x: x.month) + agg_result = ts_grouped.agg(scoreatpercentile, 80) + apply_result = ts_grouped.apply(scoreatpercentile, 80) + trans_result = ts_grouped.transform(scoreatpercentile, 80) + + agg_expected = ts_grouped.quantile(.8) + trans_expected = ts_grouped.transform(g) + + assert_series_equal(apply_result, agg_expected) + assert_series_equal(agg_result, agg_expected) + assert_series_equal(trans_result, trans_expected) + + agg_result = ts_grouped.agg(f, q=80) + apply_result = ts_grouped.apply(f, q=80) + trans_result = ts_grouped.transform(f, q=80) + assert_series_equal(agg_result, agg_expected) + assert_series_equal(apply_result, agg_expected) + assert_series_equal(trans_result, trans_expected) + + # DataFrame + df_grouped = self.tsframe.groupby(lambda x: x.month) + agg_result = df_grouped.agg(scoreatpercentile, 80) + apply_result = df_grouped.apply(DataFrame.quantile, .8) + expected = df_grouped.quantile(.8) + assert_frame_equal(apply_result, expected) + assert_frame_equal(agg_result, expected) + + agg_result = df_grouped.agg(f, q=80) + apply_result = df_grouped.apply(DataFrame.quantile, q=.8) + assert_frame_equal(agg_result, expected) + assert_frame_equal(apply_result, expected) + + # def test_cython_na_bug(self): + # values = np.random.randn(10) + # shape = (5, 5) + # label_list = [np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=np.int32), + # np.array([1, 2, 3, 4, 0, 1, 2, 3, 3, 4], dtype=np.int32)] + + # lib.group_aggregate(values, label_list, shape) + + def test_size(self): + grouped = self.df.groupby(['A', 'B']) + result = grouped.size() + for key, group in 
grouped: + self.assertEquals(result[key], len(group)) + + grouped = self.df.groupby('A') + result = grouped.size() + for key, group in grouped: + self.assertEquals(result[key], len(group)) + + grouped = self.df.groupby('B') + result = grouped.size() + for key, group in grouped: + self.assertEquals(result[key], len(group)) + + def test_grouping_ndarray(self): + grouped = self.df.groupby(self.df['A'].values) + + result = grouped.sum() + expected = self.df.groupby('A').sum() + assert_frame_equal(result, expected) + + def test_apply_typecast_fail(self): + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_multiindex_fail(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + v = group['v'] + group['v2'] = (v - v.min()) / (v.max() - v.min()) + return group + + result = df.groupby('d').apply(f) + + expected = df.copy() + expected['v2'] = np.tile([0., 0.5, 1], 2) + + assert_frame_equal(result, expected) + + def test_apply_corner(self): + result = self.tsframe.groupby(lambda x: x.year).apply(lambda x: x * 2) + expected = self.tsframe * 2 + assert_frame_equal(result, expected) + + def test_transform_mixed_type(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]]) + df = DataFrame({'d' : [1.,1.,1.,2.,2.,2.], + 'c' : np.tile(['a','b','c'], 2), + 'v' : np.arange(1., 7.)}, index=index) + + def f(group): + group['g'] = group['d'] * 2 + return group[:1] + + grouped = df.groupby('c') + result = grouped.apply(f) + + self.assert_(result['d'].dtype == np.float64) + + for key, group in grouped: + res = f(group) + assert_frame_equal(res, result.ix[key]) + + def test_groupby_wrong_multi_labels(self): + from pandas import read_csv + from pandas.util.py3compat import StringIO + data = """index,foo,bar,baz,spam,data +0,foo1,bar1,baz1,spam2,20 +1,foo1,bar2,baz1,spam3,30 +2,foo2,bar2,baz1,spam2,40 +3,foo1,bar1,baz2,spam1,50 +4,foo3,bar1,baz2,spam1,60""" + data = read_csv(StringIO(data), index_col=0) + + grouped = data.groupby(['foo', 'bar', 'baz', 'spam']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_groupby_series_with_name(self): + result = self.df.groupby(self.df['A']).mean() + result2 = self.df.groupby(self.df['A'], as_index=False).mean() + self.assertEquals(result.index.name, 'A') + self.assert_('A' in result2) + + result = self.df.groupby([self.df['A'], self.df['B']]).mean() + result2 = self.df.groupby([self.df['A'], self.df['B']], + as_index=False).mean() + self.assertEquals(result.index.names, ['A', 'B']) + self.assert_('A' in result2) + self.assert_('B' in result2) + + def test_groupby_nonstring_columns(self): + df = DataFrame([np.arange(10) for x in range(10)]) + grouped = df.groupby(0) + result = grouped.mean() + expected = df.groupby(df[0]).mean() + del expected[0] + assert_frame_equal(result, expected) + + def test_cython_grouper_series_bug_noncontig(self): + arr = np.empty((100, 100)) + arr.fill(np.nan) + obj = Series(arr[:, 0], index=range(100)) + inds = np.tile(range(10), 10) + + result = 
obj.groupby(inds).agg(Series.median) + self.assert_(result.isnull().all()) + + def test_convert_objects_leave_decimal_alone(self): + from decimal import Decimal + + s = Series(range(5)) + labels = np.array(['a', 'b', 'c', 'd', 'e'], dtype='O') + + def convert_fast(x): + return Decimal(str(x.mean())) + + def convert_force_pure(x): + # base will be length 0 + assert(len(x.base) == len(x)) + return Decimal(str(x.mean())) + + grouped = s.groupby(labels) + + result = grouped.agg(convert_fast) + self.assert_(result.dtype == np.object_) + self.assert_(isinstance(result[0], Decimal)) + + result = grouped.agg(convert_force_pure) + self.assert_(result.dtype == np.object_) + self.assert_(isinstance(result[0], Decimal)) + + def test_groupby_list_infer_array_like(self): + result = self.df.groupby(list(self.df['A'])).mean() + expected = self.df.groupby(self.df['A']).mean() + assert_frame_equal(result, expected) + + self.assertRaises(Exception, self.df.groupby, list(self.df['A'][:-1])) + + # pathological case of ambiguity + df = DataFrame({'foo' : [0, 1], 'bar' : [3, 4], + 'val' : np.random.randn(2)}) + + result = df.groupby(['foo', 'bar']).mean() + expected = df.groupby([df['foo'], df['bar']]).mean()[['val']] + + def test_dictify(self): + dict(iter(self.df.groupby('A'))) + dict(iter(self.df.groupby(['A', 'B']))) + dict(iter(self.df['C'].groupby(self.df['A']))) + dict(iter(self.df['C'].groupby([self.df['A'], self.df['B']]))) + dict(iter(self.df.groupby('A')['C'])) + dict(iter(self.df.groupby(['A', 'B'])['C'])) + + def test_sparse_friendly(self): + sdf = self.df[['C', 'D']].to_sparse() + panel = tm.makePanel() + tm.add_nans(panel) + + def _check_work(gp): + gp.mean() + gp.agg(np.mean) + dict(iter(gp)) + + # it works! + _check_work(sdf.groupby(lambda x: x // 2)) + _check_work(sdf['C'].groupby(lambda x: x // 2)) + _check_work(sdf.groupby(self.df['A'])) + + # do this someday + # _check_work(panel.groupby(lambda x: x.month, axis=1)) + + def test_panel_groupby(self): + self.panel = tm.makePanel() + tm.add_nans(self.panel) + grouped = self.panel.groupby({'ItemA' : 0, 'ItemB' : 0, 'ItemC' : 1}, + axis='items') + agged = grouped.mean() + agged2 = grouped.agg(lambda x: x.mean('items')) + + tm.assert_panel_equal(agged, agged2) + + self.assert_(np.array_equal(agged.items, [0, 1])) + + grouped = self.panel.groupby(lambda x: x.month, axis='major') + agged = grouped.mean() + + self.assert_(np.array_equal(agged.major_axis, [1, 2])) + + grouped = self.panel.groupby({'A' : 0, 'B' : 0, 'C' : 1, 'D' : 1}, + axis='minor') + agged = grouped.mean() + self.assert_(np.array_equal(agged.minor_axis, [0, 1])) + + def test_numpy_groupby(self): + from pandas.core.groupby import numpy_groupby + + data = np.random.randn(100, 100) + labels = np.random.randint(0, 10, size=100) + + df = DataFrame(data) + + result = df.groupby(labels).sum().values + expected = numpy_groupby(data, labels) + assert_almost_equal(result, expected) + + result = df.groupby(labels, axis=1).sum().values + expected = numpy_groupby(data, labels, axis=1) + assert_almost_equal(result, expected) + + def test_groupby_2d_malformed(self): + d = DataFrame(index=range(2)) + d['group'] = ['g1', 'g2'] + d['zeros'] = [0, 0] + d['ones'] = [1, 1] + d['label'] = ['l1', 'l2'] + tmp = d.groupby(['group']).mean() + res_values = np.array([[0., 1.], [0., 1.]]) + self.assert_(np.array_equal(tmp.columns, ['zeros', 'ones'])) + self.assert_(np.array_equal(tmp.values, res_values)) + + def test_int32_overflow(self): + B = np.concatenate((np.arange(10000), np.arange(10000), + 
np.arange(5000))) + A = np.arange(25000) + df = DataFrame({'A' : A, 'B' : B, + 'C' : A, 'D' : B, + 'E' : np.random.randn(25000)}) + + left = df.groupby(['A', 'B', 'C', 'D']).sum() + right = df.groupby(['D', 'C', 'B', 'A']).sum() + self.assert_(len(left) == len(right)) + + def test_int64_overflow(self): + B = np.concatenate((np.arange(1000), np.arange(1000), + np.arange(500))) + A = np.arange(2500) + df = DataFrame({'A' : A, 'B' : B, + 'C' : A, 'D' : B, + 'E' : A, 'F' : B, + 'G' : A, 'H' : B, + 'values' : np.random.randn(2500)}) + + lg = df.groupby(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']) + rg = df.groupby(['H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']) + + left = lg.sum()['values'] + right = rg.sum()['values'] + + exp_index, _ = left.index.sortlevel(0) + self.assert_(left.index.equals(exp_index)) + + exp_index, _ = right.index.sortlevel(0) + self.assert_(right.index.equals(exp_index)) + + tups = map(tuple, df[['A', 'B', 'C', 'D', + 'E', 'F', 'G', 'H']].values) + tups = com._asarray_tuplesafe(tups) + expected = df.groupby(tups).sum()['values'] + + for k, v in expected.iteritems(): + self.assert_(left[k] == right[k[::-1]] == v) + self.assert_(len(left) == len(right)) + + def test_groupby_sort_multi(self): + df = DataFrame({'a' : ['foo', 'bar', 'baz'], + 'b' : [3, 2, 1], + 'c' : [0, 1, 2], + 'd' : np.random.randn(3)}) + + tups = map(tuple, df[['a', 'b', 'c']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['a', 'b', 'c'], sort=True).sum() + self.assert_(np.array_equal(result.index.values, + tups[[1, 2, 0]])) + + tups = map(tuple, df[['c', 'a', 'b']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['c', 'a', 'b'], sort=True).sum() + self.assert_(np.array_equal(result.index.values, tups)) + + tups = map(tuple, df[['b', 'c', 'a']].values) + tups = com._asarray_tuplesafe(tups) + result = df.groupby(['b', 'c', 'a'], sort=True).sum() + self.assert_(np.array_equal(result.index.values, + tups[[2, 1, 0]])) + + df = DataFrame({'a' : [0, 1, 2, 0, 1, 2], + 'b' : [0, 0, 0, 1, 1, 1], + 'd' : np.random.randn(6)}) + grouped = df.groupby(['a', 'b'])['d'] + result = grouped.sum() + _check_groupby(df, result, ['a', 'b'], 'd') + + def test_intercept_builtin_sum(self): + import __builtin__ + s = Series([1., 2., np.nan, 3.]) + grouped = s.groupby([0, 1, 2, 2]) + + result = grouped.agg(__builtin__.sum) + result2 = grouped.apply(__builtin__.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_column_select_via_attr(self): + result = self.df.groupby('A').C.sum() + expected = self.df.groupby('A')['C'].sum() + assert_series_equal(result, expected) + + self.df['mean'] = 1.5 + result = self.df.groupby('A').mean() + expected = self.df.groupby('A').agg(np.mean) + assert_frame_equal(result, expected) + + def test_rank_apply(self): + lev1 = np.array([rands(10) for _ in xrange(1000)], dtype=object) + lev2 = np.array([rands(10) for _ in xrange(130)], dtype=object) + lab1 = np.random.randint(0, 1000, size=5000) + lab2 = np.random.randint(0, 130, size=5000) + + df = DataFrame({'value' : np.random.randn(5000), + 'key1' : lev1.take(lab1), + 'key2' : lev2.take(lab2)}) + + result = df.groupby(['key1', 'key2']).value.rank() + + expected = [] + for key, piece in df.groupby(['key1', 'key2']): + expected.append(piece.value.rank()) + expected = concat(expected, axis=0) + expected = expected.reindex(result.index) + + assert_series_equal(result, expected) + + def test_dont_clobber_name_column(self): + df = DataFrame({'key': ['a', 
'a', 'a', 'b', 'b', 'b'], + 'name' : ['foo', 'bar', 'baz'] * 2}) + + result = df.groupby('key').apply(lambda x: x) + assert_frame_equal(result, df) + + def test_skip_group_keys(self): + from pandas import concat + + tsf = tm.makeTimeDataFrame() + + grouped = tsf.groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.sort_index(by='A')[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.sort_index(by='A')[:3]) + + expected = concat(pieces) + assert_frame_equal(result, expected) + + grouped = tsf['A'].groupby(lambda x: x.month, group_keys=False) + result = grouped.apply(lambda x: x.order()[:3]) + + pieces = [] + for key, group in grouped: + pieces.append(group.order()[:3]) + + expected = concat(pieces) + assert_series_equal(result, expected) + + def test_no_nonsense_name(self): + # GH #995 + s = self.frame['C'].copy() + s.name = None + + result = s.groupby(self.frame['A']).agg(np.sum) + self.assert_(result.name is None) + + def test_wrap_agg_out(self): + grouped = self.three_group.groupby(['A', 'B']) + def func(ser): + if ser.dtype == np.object: + raise TypeError + else: + return ser.sum() + result = grouped.aggregate(func) + exp_grouped = self.three_group.ix[:, self.three_group.columns != 'C'] + expected = exp_grouped.groupby(['A', 'B']).aggregate(func) + assert_frame_equal(result, expected) + + def test_multifunc_sum_bug(self): + # GH #1065 + x = DataFrame(np.arange(9).reshape(3,3)) + x['test']=0 + x['fl']= [1.3,1.5,1.6] + + grouped = x.groupby('test') + result = grouped.agg({'fl':'sum',2:'size'}) + self.assert_(result['fl'].dtype == np.float64) + + def test_handle_dict_return_value(self): + def f(group): + return {'min': group.min(), 'max': group.max()} + + def g(group): + return Series({'min': group.min(), 'max': group.max()}) + + result = self.df.groupby('A')['C'].apply(f) + expected = self.df.groupby('A')['C'].apply(g) + + self.assert_(isinstance(result, Series)) + assert_series_equal(result, expected) + + def test_getitem_list_of_columns(self): + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', + 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8), + 'E': np.random.randn(8)}) + + result = df.groupby('A')[['C', 'D']].mean() + result2 = df.groupby('A')['C', 'D'].mean() + result3 = df.groupby('A')[df.columns[2:4]].mean() + + expected = df.ix[:, ['A', 'C', 'D']].groupby('A').mean() + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + assert_frame_equal(result3, expected) + + def test_agg_multiple_functions_maintain_order(self): + # GH #610 + funcs = [('mean', np.mean), ('max', np.max), ('min', np.min)] + result = self.df.groupby('A')['C'].agg(funcs) + exp_cols = ['mean', 'max', 'min'] + + self.assert_(np.array_equal(result.columns, exp_cols)) + + def test_multiple_functions_tuples_and_non_tuples(self): + # #1359 + + funcs = [('foo', 'mean'), 'std'] + ex_funcs = [('foo', 'mean'), ('std', 'std')] + + result = self.df.groupby('A')['C'].agg(funcs) + expected = self.df.groupby('A')['C'].agg(ex_funcs) + assert_frame_equal(result, expected) + + result = self.df.groupby('A').agg(funcs) + expected = self.df.groupby('A').agg(ex_funcs) + assert_frame_equal(result, expected) + + def test_more_flexible_frame_multi_function(self): + from pandas import concat + + grouped = self.df.groupby('A') + + exmean = grouped.agg({'C' : np.mean, 'D' : np.mean}) + exstd = grouped.agg({'C' : np.std, 'D' : np.std}) + + expected = 
concat([exmean, exstd], keys=['mean', 'std'], axis=1) + expected = expected.swaplevel(0, 1, axis=1).sortlevel(0, axis=1) + + result = grouped.aggregate({'C' : [np.mean, np.std], + 'D' : [np.mean, np.std]}) + + assert_frame_equal(result, expected) + + # be careful + result = grouped.aggregate({'C' : np.mean, + 'D' : [np.mean, np.std]}) + expected = grouped.aggregate({'C' : [np.mean], + 'D' : [np.mean, np.std]}) + assert_frame_equal(result, expected) + + + def foo(x): return np.mean(x) + def bar(x): return np.std(x, ddof=1) + result = grouped.aggregate({'C' : np.mean, + 'D' : {'foo': np.mean, + 'bar': np.std}}) + expected = grouped.aggregate({'C' : [np.mean], + 'D' : [foo, bar]}) + assert_frame_equal(result, expected) + + def test_multi_function_flexible_mix(self): + # GH #1268 + + grouped = self.df.groupby('A') + + result = grouped.aggregate({'C' : {'foo' : 'mean', + 'bar' : 'std'}, + 'D' : 'sum'}) + result2 = grouped.aggregate({'C' : {'foo' : 'mean', + 'bar' : 'std'}, + 'D' : ['sum']}) + + expected = grouped.aggregate({'C' : {'foo' : 'mean', + 'bar' : 'std'}, + 'D' : {'sum' : 'sum'}}) + + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_set_group_name(self): + def f(group): + assert group.name is not None + return group + + def freduce(group): + assert group.name is not None + return group.sum() + + def foo(x): + return freduce(x) + + def _check_all(grouped): + # make sure all these work + grouped.apply(f) + grouped.aggregate(freduce) + grouped.aggregate({'C': freduce, 'D': freduce}) + grouped.transform(f) + + grouped['C'].apply(f) + grouped['C'].aggregate(freduce) + grouped['C'].aggregate([freduce, foo]) + grouped['C'].transform(f) + + _check_all(self.df.groupby('A')) + _check_all(self.df.groupby(['A', 'B'])) + + def test_no_dummy_key_names(self): + # GH #1291 + + result = self.df.groupby(self.df['A'].values).sum() + self.assert_(result.index.name is None) + + result = self.df.groupby([self.df['A'].values, + self.df['B'].values]).sum() + self.assert_(result.index.names == [None, None]) + + def test_groupby_categorical(self): + levels = ['foo', 'bar', 'baz', 'qux'] + labels = np.random.randint(0, 4, size=100) + + cats = Categorical(labels, levels, name='myfactor') + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + + assert_frame_equal(result, expected) + self.assert_(result.index.name == cats.name) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.labels.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels, sort=False).describe() + assert_frame_equal(desc_result, expected) + + def test_groupby_groups_datetimeindex(self): + # #1430 + from pandas.tseries.api import DatetimeIndex + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange(periods), + 'low': np.arange(periods)}, index=ind) + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + + # it works! 
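+ # the .groups mapping should be keyed by the datetime objects produced by the grouping lambda, not by the raw index values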
+ groups = grouped.groups + self.assert_(isinstance(groups.keys()[0], datetime)) + + def test_groupby_reindex_inside_function(self): + from pandas.tseries.api import DatetimeIndex + + periods = 1000 + ind = DatetimeIndex(start='2012/1/1', freq='5min', periods=periods) + df = DataFrame({'high': np.arange(periods), 'low': np.arange(periods)}, index=ind) + + def agg_before(hour, func, fix=False): + """ + Run an aggregate func on the subset of data. + """ + def _func(data): + d = data.select(lambda x: x.hour < 11).dropna() + if fix: + data[data.index[0]] + if len(d) == 0: + return None + return func(d) + return _func + + def afunc(data): + d = data.select(lambda x: x.hour < 11).dropna() + return np.max(d) + + grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day)) + closure_bad = grouped.agg({'high': agg_before(11, np.max)}) + closure_good = grouped.agg({'high': agg_before(11, np.max, True)}) + + assert_frame_equal(closure_bad, closure_good) + + def test_multiindex_columns_empty_level(self): + l = [['count', 'values'], ['to filter', '']] + midx = MultiIndex.from_tuples(l) + + df = DataFrame([[1L, 'A']], columns=midx) + + grouped = df.groupby('to filter').groups + self.assert_(np.array_equal(grouped['A'], [0])) + + grouped = df.groupby([('to filter', '')]).groups + self.assert_(np.array_equal(grouped['A'], [0])) + + df = DataFrame([[1L, 'A'], [2L, 'B']], columns=midx) + + expected = df.groupby('to filter').groups + result = df.groupby([('to filter', '')]).groups + self.assertEquals(result, expected) + + df = DataFrame([[1L, 'A'], [2L, 'A']], columns=midx) + + expected = df.groupby('to filter').groups + result = df.groupby([('to filter', '')]).groups + self.assertEquals(result, expected) + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = map(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in expected.iteritems(): + assert(result[k] == v) + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert(np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), + np.tile([0, 2, 4, 3, 0, 1, 2, 3], 100), + np.tile([5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), + np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py new file mode 100644 index 00000000..1f74eda4 --- /dev/null +++ b/pandas/tests/test_index.py @@ -0,0 +1,1572 @@ +# pylint: disable=E1101,E1103,W0232 + +from datetime import datetime, timedelta +import operator +import pickle +import unittest +import nose +import os + +import numpy as np +from numpy.testing import assert_array_equal + +from pandas.core.categorical import Factor +from pandas.core.index import Index, Int64Index, MultiIndex +from pandas.util.testing import assert_almost_equal +from pandas.util import py3compat +import pandas.core.common as com + +import pandas.util.testing as tm + +from pandas.tseries.index import _to_m8 +import pandas.tseries.offsets as offsets + +class TestIndex(unittest.TestCase): + + def setUp(self): + 
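+ # build one index of each flavour (string, date, int, float) plus empty and tuple corner cases used throughout the tests below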
self.strIndex = tm.makeStringIndex(100) + self.dateIndex = tm.makeDateIndex(100) + self.intIndex = tm.makeIntIndex(100) + self.floatIndex = tm.makeFloatIndex(100) + self.empty = Index([]) + self.tuples = Index(zip(['foo', 'bar', 'baz'], [1, 2, 3])) + + def test_hash_error(self): + self.assertRaises(TypeError, hash, self.strIndex) + + def test_new_axis(self): + new_index = self.dateIndex[None, :] + self.assert_(new_index.ndim == 2) + self.assert_(type(new_index) == np.ndarray) + + def test_deepcopy(self): + from copy import deepcopy + + copy = deepcopy(self.strIndex) + self.assert_(copy is self.strIndex) + + def test_duplicates(self): + idx = Index([0, 0, 0]) + self.assert_(not idx.is_unique) + + def test_sort(self): + self.assertRaises(Exception, self.strIndex.sort) + + def test_mutability(self): + self.assertRaises(Exception, self.strIndex.__setitem__, 5, 0) + self.assertRaises(Exception, self.strIndex.__setitem__, slice(1,5), 0) + + def test_constructor(self): + # regular instance creation + tm.assert_contains_all(self.strIndex, self.strIndex) + tm.assert_contains_all(self.dateIndex, self.dateIndex) + + # casting + arr = np.array(self.strIndex) + index = arr.view(Index) + tm.assert_contains_all(arr, index) + self.assert_(np.array_equal(self.strIndex, index)) + + # copy + arr = np.array(self.strIndex) + index = Index(arr, copy=True, name='name') + self.assert_(isinstance(index, Index)) + self.assert_(index.name == 'name') + assert_array_equal(arr, index) + + # what to do here? + # arr = np.array(5.) + # self.assertRaises(Exception, arr.view, Index) + + def test_constructor_corner(self): + # corner case + self.assertRaises(Exception, Index, 0) + + def test_astype(self): + casted = self.intIndex.astype('i8') + + # it works! + casted.get_loc(5) + + # pass on name + self.intIndex.name = 'foobar' + casted = self.intIndex.astype('i8') + self.assertEqual(casted.name, 'foobar') + + def test_compat(self): + self.strIndex.tolist() + + def test_equals(self): + # same + self.assert_(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'c']))) + + # different length + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b']))) + + # same length, different values + self.assertFalse(Index(['a', 'b', 'c']).equals(Index(['a', 'b', 'd']))) + + # Must also be an Index + self.assertFalse(Index(['a', 'b', 'c']).equals(['a', 'b', 'c'])) + + def test_asof(self): + d = self.dateIndex[0] + self.assert_(self.dateIndex.asof(d) is d) + self.assert_(np.isnan(self.dateIndex.asof(d - timedelta(1)))) + + d = self.dateIndex[-1] + self.assert_(self.dateIndex.asof(d + timedelta(1)) == d) + + def test_argsort(self): + result = self.strIndex.argsort() + expected = np.array(self.strIndex).argsort() + self.assert_(np.array_equal(result, expected)) + + def test_comparators(self): + index = self.dateIndex + element = index[len(index) // 2] + element = _to_m8(element) + + arr = np.array(index) + + def _check(op): + arr_result = op(arr, element) + index_result = op(index, element) + + self.assert_(isinstance(index_result, np.ndarray)) + self.assert_(not isinstance(index_result, Index)) + self.assert_(np.array_equal(arr_result, index_result)) + + _check(operator.eq) + _check(operator.ne) + _check(operator.gt) + _check(operator.lt) + _check(operator.ge) + _check(operator.le) + + def test_booleanindex(self): + boolIdx = np.repeat(True, len(self.strIndex)).astype(bool) + boolIdx[5:30:2] = False + + subIndex = self.strIndex[boolIdx] + + for i, val in enumerate(subIndex): + self.assertEqual(subIndex.get_loc(val), i) + + subIndex = 
self.strIndex[list(boolIdx)] + for i, val in enumerate(subIndex): + self.assertEqual(subIndex.get_loc(val), i) + + def test_fancy(self): + sl = self.strIndex[[1,2,3]] + for i in sl: + self.assertEqual(i, sl[sl.get_loc(i)]) + + def test_getitem(self): + arr = np.array(self.dateIndex) + exp = self.dateIndex[5] + exp = _to_m8(exp) + + self.assertEquals(exp, arr[5]) + + def test_shift(self): + shifted = self.dateIndex.shift(0, timedelta(1)) + self.assert_(shifted is self.dateIndex) + + shifted = self.dateIndex.shift(5, timedelta(1)) + self.assert_(np.array_equal(shifted, self.dateIndex + timedelta(5))) + + shifted = self.dateIndex.shift(1, 'B') + self.assert_(np.array_equal(shifted, self.dateIndex + offsets.BDay())) + + def test_intersection(self): + first = self.strIndex[:20] + second = self.strIndex[:10] + intersect = first.intersection(second) + + self.assert_(tm.equalContents(intersect, second)) + + # Corner cases + inter = first.intersection(first) + self.assert_(inter is first) + + # non-iterable input + self.assertRaises(Exception, first.intersection, 0.5) + + def test_union(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + everything = self.strIndex[:20] + union = first.union(second) + self.assert_(tm.equalContents(union, everything)) + + # Corner cases + union = first.union(first) + self.assert_(union is first) + + union = first.union([]) + self.assert_(union is first) + + union = Index([]).union(first) + self.assert_(union is first) + + # non-iterable input + self.assertRaises(Exception, first.union, 0.5) + + # preserve names + first.name = 'A' + second.name = 'A' + union = first.union(second) + self.assert_(union.name == 'A') + + second.name = 'B' + union = first.union(second) + self.assert_(union.name is None) + + def test_add(self): + firstCat = self.strIndex + self.dateIndex + secondCat = self.strIndex + self.strIndex + + if self.dateIndex.dtype == np.object_: + appended = np.append(self.strIndex, self.dateIndex) + else: + appended = np.append(self.strIndex, self.dateIndex.astype('O')) + + self.assert_(tm.equalContents(firstCat, appended)) + self.assert_(tm.equalContents(secondCat, self.strIndex)) + tm.assert_contains_all(self.strIndex, firstCat) + tm.assert_contains_all(self.strIndex, secondCat) + tm.assert_contains_all(self.dateIndex, firstCat) + + def test_append_multiple(self): + index = Index(['a', 'b', 'c', 'd', 'e', 'f']) + + foos = [index[:2], index[2:4], index[4:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(index)) + + # empty + result = index.append([]) + self.assert_(result.equals(index)) + + def test_append_empty_preserve_name(self): + left = Index([], name='foo') + right = Index([1, 2, 3], name='foo') + + result = left.append(right) + self.assert_(result.name == 'foo') + + left = Index([], name='foo') + right = Index([1, 2, 3], name='bar') + + result = left.append(right) + self.assert_(result.name is None) + + def test_add_string(self): + # from bug report + index = Index(['a', 'b', 'c']) + index2 = index + 'foo' + + self.assert_('a' not in index2) + self.assert_('afoo' in index2) + + def test_diff(self): + first = self.strIndex[5:20] + second = self.strIndex[:10] + answer = self.strIndex[10:20] + result = first - second + + self.assert_(tm.equalContents(result, answer)) + + diff = first.diff(first) + self.assert_(len(diff) == 0) + + # non-iterable input + self.assertRaises(Exception, first.diff, 0.5) + + def test_pickle(self): + def testit(index): + pickled = pickle.dumps(index) + unpickled = pickle.loads(pickled) + + 
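+ # a pickle round trip should give back an equivalent Index: same class, same values, same name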
self.assert_(isinstance(unpickled, Index)) + self.assert_(np.array_equal(unpickled, index)) + self.assertEquals(unpickled.name, index.name) + + # tm.assert_dict_equal(unpickled.indexMap, index.indexMap) + + testit(self.strIndex) + self.strIndex.name = 'foo' + testit(self.strIndex) + + testit(self.dateIndex) + + def test_is_numeric(self): + self.assert_(not self.dateIndex.is_numeric()) + self.assert_(not self.strIndex.is_numeric()) + self.assert_(self.intIndex.is_numeric()) + self.assert_(self.floatIndex.is_numeric()) + + def test_is_all_dates(self): + self.assert_(self.dateIndex.is_all_dates) + self.assert_(not self.strIndex.is_all_dates) + self.assert_(not self.intIndex.is_all_dates) + + def test_summary(self): + self._check_method_works(Index.summary) + + def test_format(self): + self._check_method_works(Index.format) + + index = Index([datetime.now()]) + formatted = index.format() + expected = [str(index[0])] + self.assertEquals(formatted, expected) + + self.strIndex[:0].format() + + def test_format_with_name_time_info(self): + # bug I fixed 12/20/2011 + inc = timedelta(hours=4) + dates = Index([dt + inc for dt in self.dateIndex], name='something') + + formatted = dates.format(name=True) + self.assert_(formatted[0] == 'something') + + def test_format_datetime_with_time(self): + t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)]) + + result = t.format() + expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00'] + self.assert_(len(result) == 2) + self.assertEquals(result, expected) + + def test_format_none(self): + values = ['a', 'b', 'c', None] + + idx = Index(values) + idx.format() + self.assert_(idx[3] is None) + + def test_take(self): + indexer = [4, 3, 0, 2] + result = self.dateIndex.take(indexer) + expected = self.dateIndex[indexer] + self.assert_(result.equals(expected)) + + def _check_method_works(self, method): + method(self.empty) + method(self.dateIndex) + method(self.strIndex) + method(self.intIndex) + method(self.tuples) + + def test_get_indexer(self): + idx1 = Index([1, 2, 3, 4, 5]) + idx2 = Index([2, 4, 6]) + + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [1, 3, -1]) + + r1 = idx2.get_indexer(idx1, method='pad') + assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + + rffill1 = idx2.get_indexer(idx1, method='ffill') + assert_almost_equal(r1, rffill1) + + r1 = idx2.get_indexer(idx1, method='backfill') + assert_almost_equal(r1, [0, 0, 1, 1, 2]) + + rbfill1 = idx2.get_indexer(idx1, method='bfill') + assert_almost_equal(r1, rbfill1) + + def test_slice_locs(self): + idx = Index([0, 1, 2, 5, 6, 7, 9, 10]) + n = len(idx) + + self.assertEquals(idx.slice_locs(start=2), (2, n)) + self.assertEquals(idx.slice_locs(start=3), (3, n)) + self.assertEquals(idx.slice_locs(3, 8), (3, 6)) + self.assertEquals(idx.slice_locs(5, 10), (3, n)) + self.assertEquals(idx.slice_locs(end=8), (0, 6)) + self.assertEquals(idx.slice_locs(end=9), (0, 7)) + + idx2 = idx[::-1] + self.assertRaises(KeyError, idx2.slice_locs, 8, 2) + self.assertRaises(KeyError, idx2.slice_locs, 7, 3) + + def test_drop(self): + n = len(self.strIndex) + + dropped = self.strIndex.drop(self.strIndex[range(5, 10)]) + expected = self.strIndex[range(5) + range(10, n)] + self.assert_(dropped.equals(expected)) + + self.assertRaises(ValueError, self.strIndex.drop, ['foo', 'bar']) + + dropped = self.strIndex.drop(self.strIndex[0]) + expected = self.strIndex[1:] + self.assert_(dropped.equals(expected)) + + ser = Index([1,2,3]) + dropped = ser.drop(1) + expected = Index([2,3]) + self.assert_(dropped.equals(expected)) + + def 
test_tuple_union_bug(self): + import pandas + import numpy as np + + aidx1 = np.array([(1, 'A'),(2, 'A'),(1, 'B'),(2, 'B')], dtype=[('num', + int),('let', 'a1')]) + aidx2 = np.array([(1, 'A'),(2, 'A'),(1, 'B'),(2, 'B'),(1,'C'),(2, + 'C')], dtype=[('num', int),('let', 'a1')]) + + idx1 = pandas.Index(aidx1) + idx2 = pandas.Index(aidx2) + + # intersection broken? + int_idx = idx1.intersection(idx2) + # needs to be 1d like idx1 and idx2 + expected = idx1[:4] # pandas.Index(sorted(set(idx1) & set(idx2))) + self.assert_(int_idx.ndim == 1) + self.assert_(int_idx.equals(expected)) + + # union broken + union_idx = idx1.union(idx2) + expected = pandas.Index(sorted(set(idx1) | set(idx2))) + self.assert_(union_idx.ndim == 1) + self.assert_(union_idx.equals(expected)) + + def test_is_monotonic_incomparable(self): + index = Index([5, datetime.now(), 7]) + self.assert_(not index.is_monotonic) + + def test_get_set_value(self): + values = np.random.randn(100) + date = self.dateIndex[67] + + assert_almost_equal(self.dateIndex.get_value(values, date), + values[67]) + + self.dateIndex.set_value(values, date, 10) + self.assertEquals(values[67], 10) + + def test_isin(self): + values = ['foo', 'bar'] + + idx = Index(['qux', 'baz', 'foo', 'bar']) + result = idx.isin(values) + expected = np.array([False, False, True, True]) + self.assert_(np.array_equal(result, expected)) + + # empty, return dtype bool + idx = Index([]) + result = idx.isin(values) + self.assert_(len(result) == 0) + self.assert_(result.dtype == np.bool_) + + def test_boolean_cmp(self): + values = [1,2,3,4] + + idx = Index(values) + res = (idx == values) + + self.assert_(res.all()) + self.assert_(res.dtype == 'bool') + self.assert_(not isinstance(res, Index)) + +class TestInt64Index(unittest.TestCase): + + def setUp(self): + self.index = Int64Index(np.arange(0, 20, 2)) + + def test_constructor(self): + # pass list, coerce fine + index = Int64Index([-5, 0, 1, 2]) + expected = np.array([-5, 0, 1, 2], dtype=np.int64) + self.assert_(np.array_equal(index, expected)) + + # from iterable + index = Int64Index(iter([-5, 0, 1, 2])) + self.assert_(np.array_equal(index, expected)) + + # scalar raise Exception + self.assertRaises(ValueError, Int64Index, 5) + + def test_constructor_corner(self): + arr = np.array([1, 2, 3, 4], dtype=object) + index = Int64Index(arr) + self.assert_(index.values.dtype == np.int64) + self.assert_(index.equals(arr)) + + # preventing casting + arr = np.array([1, '2', 3, '4'], dtype=object) + self.assertRaises(TypeError, Int64Index, arr) + + def test_coerce_list(self): + # coerce things + arr = Index([1, 2, 3, 4]) + self.assert_(type(arr) == Int64Index) + + # but not if explicit dtype passed + arr = Index([1, 2, 3, 4], dtype=object) + self.assert_(type(arr) == Index) + + def test_dtype(self): + self.assert_(self.index.dtype == np.int64) + + def test_is_monotonic(self): + self.assert_(self.index.is_monotonic) + + index = Int64Index([4, 3, 2, 1]) + self.assert_(not index.is_monotonic) + + def test_equals(self): + same_values = Index(self.index, dtype=object) + self.assert_(self.index.equals(same_values)) + self.assert_(same_values.equals(self.index)) + + def test_get_indexer(self): + target = Int64Index(np.arange(10)) + indexer = self.index.get_indexer(target) + expected = np.array([0, -1, 1, -1, 2, -1, 3, -1, 4, -1]) + self.assert_(np.array_equal(indexer, expected)) + + def test_get_indexer_pad(self): + target = Int64Index(np.arange(10)) + indexer = self.index.get_indexer(target, method='pad') + expected = np.array([0, 0, 1, 1, 2, 2, 3, 
3, 4, 4]) + self.assert_(np.array_equal(indexer, expected)) + + def test_get_indexer_backfill(self): + target = Int64Index(np.arange(10)) + indexer = self.index.get_indexer(target, method='backfill') + expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5]) + self.assert_(np.array_equal(indexer, expected)) + + def test_join_outer(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + # guarantee of sortedness + res, lidx, ridx = self.index.join(other, how='outer', + return_indexers=True) + noidx_res = self.index.join(other, how='outer') + self.assert_(res.equals(noidx_res)) + + eres = Int64Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25]) + elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], + dtype=np.int64) + eridx = np.array([-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], + dtype=np.int64) + + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='outer', + return_indexers=True) + noidx_res = self.index.join(other_mono, how='outer') + self.assert_(res.equals(noidx_res)) + + eridx = np.array([-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], + dtype=np.int64) + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + + def test_join_inner(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='inner', + return_indexers=True) + + # no guarantee of sortedness, so sort for comparison purposes + ind = res.argsort() + res = res.take(ind) + lidx = lidx.take(ind) + ridx = ridx.take(ind) + + eres = Int64Index([2, 12]) + elidx = np.array([1, 6]) + eridx = np.array([4, 1]) + + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='inner', + return_indexers=True) + + res2 = self.index.intersection(other_mono) + self.assert_(res.equals(res2)) + + eridx = np.array([1, 4]) + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + + def test_join_left(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='left', + return_indexers=True) + eres = self.index + eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], + dtype=np.int64) + + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(lidx is None) + self.assert_(np.array_equal(ridx, eridx)) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='left', + return_indexers=True) + eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], + dtype=np.int64) + self.assert_(isinstance(res, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(lidx is None) + self.assert_(np.array_equal(ridx, eridx)) + + # non-unique + """ + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + res, lidx, ridx = idx2.join(idx, how='left', return_indexers=True) + eres = idx2 + eridx = np.array([0, 2, 3, -1, -1]) + elidx = np.array([0, 1, 2, 3, 4]) + 
self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + """ + + def test_join_right(self): + other = Int64Index([7, 12, 25, 1, 2, 5]) + other_mono = Int64Index([1, 2, 5, 7, 12, 25]) + + # not monotonic + res, lidx, ridx = self.index.join(other, how='right', + return_indexers=True) + eres = other + elidx = np.array([-1, 6, -1, -1, 1, -1], + dtype=np.int64) + + self.assert_(isinstance(other, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(ridx is None) + + # monotonic + res, lidx, ridx = self.index.join(other_mono, how='right', + return_indexers=True) + eres = other_mono + elidx = np.array([-1, 1, -1, -1, 6, -1], + dtype=np.int64) + self.assert_(isinstance(other, Int64Index)) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(ridx is None) + + # non-unique + """ + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + res, lidx, ridx = idx.join(idx2, how='right', return_indexers=True) + eres = idx2 + elidx = np.array([0, 2, 3, -1, -1]) + eridx = np.array([0, 1, 2, 3, 4]) + self.assert_(res.equals(eres)) + self.assert_(np.array_equal(lidx, elidx)) + self.assert_(np.array_equal(ridx, eridx)) + + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,9,7]) + res = idx.join(idx2, how='right', return_indexers=False) + eres = idx2 + self.assert(res.equals(eres)) + """ + + def test_join_non_int_index(self): + other = Index([3, 6, 7, 8, 10], dtype=object) + + outer = self.index.join(other, how='outer') + outer2 = other.join(self.index, how='outer') + expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, + 16, 18], dtype=object) + self.assert_(outer.equals(outer2)) + self.assert_(outer.equals(expected)) + + inner = self.index.join(other, how='inner') + inner2 = other.join(self.index, how='inner') + expected = Index([6, 8, 10], dtype=object) + self.assert_(inner.equals(inner2)) + self.assert_(inner.equals(expected)) + + left = self.index.join(other, how='left') + self.assert_(left.equals(self.index)) + + left2 = other.join(self.index, how='left') + self.assert_(left2.equals(other)) + + right = self.index.join(other, how='right') + self.assert_(right.equals(other)) + + right2 = other.join(self.index, how='right') + self.assert_(right2.equals(self.index)) + + def test_join_non_unique(self): + left = Index([4, 4, 3, 3]) + + joined, lidx, ridx = left.join(left, return_indexers=True) + + exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4]) + self.assert_(joined.equals(exp_joined)) + + exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.int64) + self.assert_(np.array_equal(lidx, exp_lidx)) + + exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.int64) + self.assert_(np.array_equal(ridx, exp_ridx)) + + def test_intersection(self): + other = Index([1, 2, 3, 4, 5]) + result = self.index.intersection(other) + expected = np.sort(np.intersect1d(self.index.values, other.values)) + self.assert_(np.array_equal(result, expected)) + + result = other.intersection(self.index) + expected = np.sort(np.asarray(np.intersect1d(self.index.values, + other.values))) + self.assert_(np.array_equal(result, expected)) + + def test_intersect_str_dates(self): + dt_dates = [datetime(2012,2,9) , datetime(2012,2,22)] + + i1 = Index(dt_dates, dtype=object) + i2 = Index(['aa'], dtype=object) + res = i2.intersection(i1) + + self.assert_(len(res) == 0) + + def test_union_noncomparable(self): + from datetime import datetime, timedelta + # corner case, non-Int64Index + now = datetime.now() + other = 
Index([now + timedelta(i) for i in xrange(4)], dtype=object) + result = self.index.union(other) + expected = np.concatenate((self.index, other)) + self.assert_(np.array_equal(result, expected)) + + result = other.union(self.index) + expected = np.concatenate((other, self.index)) + self.assert_(np.array_equal(result, expected)) + + def test_cant_or_shouldnt_cast(self): + # can't + data = ['foo', 'bar', 'baz'] + self.assertRaises(TypeError, Int64Index, data) + + # shouldn't + data = ['0', '1', '2'] + self.assertRaises(TypeError, Int64Index, data) + + def test_view_Index(self): + self.index.view(Index) + + def test_prevent_casting(self): + result = self.index.astype('O') + self.assert_(result.dtype == np.object_) + + def test_take_preserve_name(self): + index = Int64Index([1,2,3,4], name='foo') + taken = index.take([3,0,1]) + self.assertEqual(index.name, taken.name) + + def test_int_name_format(self): + from pandas import Series, DataFrame + index = Index(['a', 'b', 'c'], name=0) + s = Series(range(3), index) + df = DataFrame(range(3), index=index) + repr(s) + repr(df) + +class TestMultiIndex(unittest.TestCase): + + def setUp(self): + major_axis = Index(['foo', 'bar', 'baz', 'qux']) + minor_axis = Index(['one', 'two']) + + major_labels = np.array([0, 0, 1, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + + self.index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels], + names=['first', 'second']) + + def test_constructor_single_level(self): + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], + names=['first']) + self.assert_(isinstance(single_level, Index)) + self.assert_(not isinstance(single_level, MultiIndex)) + self.assert_(single_level.name == 'first') + + single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]]) + self.assert_(single_level.name is None) + + def test_constructor_no_levels(self): + self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) + + def test_duplicate_names(self): + self.index.names = ['foo', 'foo'] + self.assertRaises(Exception, self.index._get_level_number, 'foo') + + def test_get_level_number_integer(self): + self.index.names = [1, 0] + self.assertEqual(self.index._get_level_number(1), 0) + self.assertEqual(self.index._get_level_number(0), 1) + self.assertRaises(Exception, self.index._get_level_number, 2) + + self.assertRaises(Exception, self.index._get_level_number, 'fourth') + + def test_from_arrays(self): + arrays = [] + for lev, lab in zip(self.index.levels, self.index.labels): + arrays.append(np.asarray(lev).take(lab)) + + result = MultiIndex.from_arrays(arrays) + self.assertEquals(list(result), list(self.index)) + + def test_append(self): + result = self.index[:3].append(self.index[3:]) + self.assert_(result.equals(self.index)) + + foos = [self.index[:1], self.index[1:3], self.index[3:]] + result = foos[0].append(foos[1:]) + self.assert_(result.equals(self.index)) + + # empty + result = self.index.append([]) + self.assert_(result.equals(self.index)) + + def test_get_level_values(self): + result = self.index.get_level_values(0) + expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux'] + self.assert_(np.array_equal(result, expected)) + + result = self.index.get_level_values('first') + expected = self.index.get_level_values(0) + self.assert_(np.array_equal(result, expected)) + + def test_reorder_levels(self): + # this blows up + self.assertRaises(Exception, self.index.reorder_levels, + [2, 1, 0]) + + def test_nlevels(self): + 
self.assertEquals(self.index.nlevels, 2) + + def test_iter(self): + result = list(self.index) + expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'), + ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] + self.assert_(result == expected) + + def test_pickle(self): + pickled = pickle.dumps(self.index) + unpickled = pickle.loads(pickled) + self.assert_(self.index.equals(unpickled)) + + def test_legacy_pickle(self): + if py3compat.PY3: + raise nose.SkipTest + + def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle') + obj = pickle.load(open(ppath, 'r')) + + self.assert_(obj._is_v1) + + obj2 = MultiIndex.from_tuples(obj.values) + self.assert_(obj.equals(obj2)) + + res = obj.get_indexer(obj) + exp = np.arange(len(obj)) + assert_almost_equal(res, exp) + + res = obj.get_indexer(obj2[::-1]) + exp = obj.get_indexer(obj[::-1]) + exp2 = obj2.get_indexer(obj2[::-1]) + assert_almost_equal(res, exp) + assert_almost_equal(exp, exp2) + + def test_legacy_v2_unpickle(self): + # 0.7.3 -> 0.8.0 format manage + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'mindex_073.pickle') + + obj = com.load(filepath) + + obj2 = MultiIndex.from_tuples(obj.values) + self.assert_(obj.equals(obj2)) + + res = obj.get_indexer(obj) + exp = np.arange(len(obj)) + assert_almost_equal(res, exp) + + res = obj.get_indexer(obj2[::-1]) + exp = obj.get_indexer(obj[::-1]) + exp2 = obj2.get_indexer(obj2[::-1]) + assert_almost_equal(res, exp) + assert_almost_equal(exp, exp2) + + def test_from_tuples_index_values(self): + result = MultiIndex.from_tuples(self.index) + self.assert_((result.values == self.index.values).all()) + + def test_contains(self): + self.assert_(('foo', 'two') in self.index) + self.assert_(('bar', 'two') not in self.index) + self.assert_(None not in self.index) + + def test_is_all_dates(self): + self.assert_(not self.index.is_all_dates) + + def test_is_numeric(self): + # MultiIndex is never numeric + self.assert_(not self.index.is_numeric()) + + def test_getitem(self): + # scalar + self.assertEquals(self.index[2], ('bar', 'one')) + + # slice + result = self.index[2:5] + expected = self.index[[2,3,4]] + self.assert_(result.equals(expected)) + + # boolean + result = self.index[[True, False, True, False, True, True]] + result2 = self.index[np.array([True, False, True, False, True, True])] + expected = self.index[[0, 2, 4, 5]] + self.assert_(result.equals(expected)) + self.assert_(result2.equals(expected)) + + def test_getitem_group_select(self): + sorted_idx, _ = self.index.sortlevel(0) + self.assertEquals(sorted_idx.get_loc('baz'), slice(3, 4)) + self.assertEquals(sorted_idx.get_loc('foo'), slice(0, 2)) + + def test_get_loc(self): + self.assert_(self.index.get_loc(('foo', 'two')) == 1) + self.assert_(self.index.get_loc(('baz', 'two')) == 3) + self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two')) + self.assertRaises(KeyError, self.index.get_loc, 'quux') + + # 3 levels + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + self.assertRaises(KeyError, index.get_loc, (1, 1)) + self.assert_(index.get_loc((2, 0)) == slice(3, 5)) + + def test_get_loc_duplicates(self): + index = Index([2, 2, 2, 2]) + result = index.get_loc(2) + expected = slice(0, 4) + assert(result == expected) + # self.assertRaises(Exception, index.get_loc, 2) + + def 
test_get_loc_level(self): + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + loc, new_index = index.get_loc_level((0, 1)) + expected = slice(1, 2) + exp_index = index[expected].droplevel(0).droplevel(0) + self.assertEqual(loc, expected) + self.assert_(new_index.equals(exp_index)) + + loc, new_index = index.get_loc_level((0, 1, 0)) + expected = 1 + self.assertEqual(loc, expected) + self.assert_(new_index is None) + + self.assertRaises(KeyError, index.get_loc_level, (2, 2)) + + index = MultiIndex(levels=[[2000], range(4)], + labels=[np.array([0, 0, 0, 0]), + np.array([0, 1, 2, 3])]) + result, new_index = index.get_loc_level((2000, slice(None, None))) + expected = slice(None, None) + self.assertEqual(result, expected) + self.assert_(new_index.equals(index.droplevel(0))) + + def test_slice_locs(self): + df = tm.makeTimeDataFrame() + stacked = df.stack() + + idx = stacked.index + + slob = slice(*idx.slice_locs(df.index[5], df.index[15])) + sliced = stacked[slob] + expected = df[5:16].stack() + tm.assert_almost_equal(sliced.values, expected.values) + + slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30), + df.index[15] - timedelta(seconds=30))) + sliced = stacked[slob] + expected = df[6:15].stack() + tm.assert_almost_equal(sliced.values, expected.values) + + def test_slice_locs_not_sorted(self): + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + self.assertRaises(Exception, index.slice_locs, (1, 0, 1), + (2, 1, 0)) + + # works + sorted_index, _ = index.sortlevel(0) + result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) + + def test_slice_locs_partial(self): + sorted_idx, _ = self.index.sortlevel(0) + + result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one')) + self.assertEquals(result, (1, 5)) + + result = sorted_idx.slice_locs(None, ('qux', 'one')) + self.assertEquals(result, (0, 5)) + + result = sorted_idx.slice_locs(('foo', 'two'), None) + self.assertEquals(result, (1, len(sorted_idx))) + + result = sorted_idx.slice_locs('bar', 'baz') + self.assertEquals(result, (2, 4)) + + def test_slice_locs_not_contained(self): + # some searchsorted action + + index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], + labels=[[0, 0, 0, 1, 1, 2, 3, 3, 3], + [0, 1, 2, 1, 2, 2, 0, 1, 2]], + sortorder=0) + + result = index.slice_locs((1, 0), (5, 2)) + self.assertEquals(result, (3, 6)) + + result = index.slice_locs(1, 5) + self.assertEquals(result, (3, 6)) + + result = index.slice_locs((2, 2), (5, 2)) + self.assertEquals(result, (3, 6)) + + result = index.slice_locs(2, 5) + self.assertEquals(result, (3, 6)) + + result = index.slice_locs((1, 0), (6, 3)) + self.assertEquals(result, (3, 8)) + + result = index.slice_locs(-1, 10) + self.assertEquals(result, (0, len(index))) + + def test_consistency(self): + # need to construct an overflow + major_axis = range(70000) + minor_axis = range(10) + + major_labels = np.arange(70000) + minor_labels = np.repeat(range(10), 7000) + + # the fact that is works means it's consistent + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + + # inconsistent + major_labels = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) + index = MultiIndex(levels=[major_axis, 
minor_axis], + labels=[major_labels, minor_labels]) + + self.assert_(not index.is_unique) + + def test_truncate(self): + major_axis = Index(range(4)) + minor_axis = Index(range(2)) + + major_labels = np.array([0, 0, 1, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + + result = index.truncate(before=1) + self.assert_('foo' not in result.levels[0]) + self.assert_(1 in result.levels[0]) + + result = index.truncate(after=1) + self.assert_(2 not in result.levels[0]) + self.assert_(1 in result.levels[0]) + + result = index.truncate(before=1, after=2) + self.assertEqual(len(result.levels[0]), 2) + + # after < before + self.assertRaises(ValueError, index.truncate, 3, 1) + + def test_get_indexer(self): + major_axis = Index(range(4)) + minor_axis = Index(range(2)) + + major_labels = np.array([0, 0, 1, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + idx1 = index[:5] + idx2 = index[[1,3,5]] + + r1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, [1, 3, -1]) + + r1 = idx2.get_indexer(idx1, method='pad') + assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + + rffill1 = idx2.get_indexer(idx1, method='ffill') + assert_almost_equal(r1, rffill1) + + r1 = idx2.get_indexer(idx1, method='backfill') + assert_almost_equal(r1, [0, 0, 1, 1, 2]) + + rbfill1 = idx2.get_indexer(idx1, method='bfill') + assert_almost_equal(r1, rbfill1) + + # pass non-MultiIndex + r1 = idx1.get_indexer(idx2._tuple_index) + rexp1 = idx1.get_indexer(idx2) + assert_almost_equal(r1, rexp1) + + r1 = idx1.get_indexer([1,2,3]) + self.assert_( (r1 == [-1, -1, -1]).all() ) + + # self.assertRaises(Exception, idx1.get_indexer, + # list(list(zip(*idx2._tuple_index))[0])) + + def test_format(self): + self.index.format() + self.index[:0].format() + + def test_format_integer_names(self): + index = MultiIndex(levels=[[0, 1], [0, 1]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=[0, 1]) + index.format(names=True) + + def test_bounds(self): + self.index._bounds + + def test_equals(self): + self.assert_(self.index.equals(self.index)) + self.assert_(self.index.equal_levels(self.index)) + + self.assert_(not self.index.equals(self.index[:-1])) + + self.assert_(self.index.equals(self.index._tuple_index)) + + # different number of levels + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])]) + + index2 = MultiIndex(levels=index.levels[:-1], + labels=index.labels[:-1]) + self.assert_(not index.equals(index2)) + self.assert_(not index.equal_levels(index2)) + + # levels are different + major_axis = Index(range(4)) + minor_axis = Index(range(2)) + + major_labels = np.array([0, 0, 1, 2, 2, 3]) + minor_labels = np.array([0, 1, 0, 0, 1, 0]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + self.assert_(not self.index.equals(index)) + self.assert_(not self.index.equal_levels(index)) + + # some of the labels are different + major_axis = Index(['foo', 'bar', 'baz', 'qux']) + minor_axis = Index(['one', 'two']) + + major_labels = np.array([0, 0, 2, 2, 3, 3]) + minor_labels = np.array([0, 1, 0, 1, 0, 1]) + + index = MultiIndex(levels=[major_axis, minor_axis], + labels=[major_labels, minor_labels]) + self.assert_(not self.index.equals(index)) + + def test_union(self): + 
piece1 = self.index[:5][::-1] + piece2 = self.index[3:] + + the_union = piece1 | piece2 + + tups = sorted(self.index._tuple_index) + expected = MultiIndex.from_tuples(tups) + + self.assert_(the_union.equals(expected)) + + # corner case, pass self or empty thing: + the_union = self.index.union(self.index) + self.assert_(the_union is self.index) + + the_union = self.index.union(self.index[:0]) + self.assert_(the_union is self.index) + + # won't work in python 3 + # tuples = self.index._tuple_index + # result = self.index[:4] | tuples[4:] + # self.assert_(result.equals(tuples)) + + # not valid for python 3 + # def test_union_with_regular_index(self): + # other = Index(['A', 'B', 'C']) + + # result = other.union(self.index) + # self.assert_(('foo', 'one') in result) + # self.assert_('B' in result) + + # result2 = self.index.union(other) + # self.assert_(result.equals(result2)) + + def test_intersection(self): + piece1 = self.index[:5][::-1] + piece2 = self.index[3:] + + the_int = piece1 & piece2 + tups = sorted(self.index[3:5]._tuple_index) + expected = MultiIndex.from_tuples(tups) + self.assert_(the_int.equals(expected)) + + # corner case, pass self + the_int = self.index.intersection(self.index) + self.assert_(the_int is self.index) + + # empty intersection: disjoint + empty = self.index[:2] & self.index[2:] + expected = self.index[:0] + self.assert_(empty.equals(expected)) + + # can't do in python 3 + # tuples = self.index._tuple_index + # result = self.index & tuples + # self.assert_(result.equals(tuples)) + + def test_diff(self): + first = self.index + result = first - self.index[-3:] + expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), + sortorder=0, + names=self.index.names) + + self.assert_(isinstance(result, MultiIndex)) + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: reflexive + result = self.index - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: superset + result = self.index[-3:] - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # empty difference: degenerate + result = self.index[:0] - self.index + expected = self.index[:0] + self.assert_(result.equals(expected)) + self.assertEqual(result.names, self.index.names) + + # names not the same + chunklet = self.index[-3:] + chunklet.names = ['foo', 'baz'] + result = first - chunklet + self.assertEqual(result.names, [None, None]) + + # empty, but non-equal + result = self.index - self.index.sortlevel(1)[0] + self.assert_(len(result) == 0) + + # raise Exception called with non-MultiIndex + self.assertRaises(Exception, first.diff, first._tuple_index) + + def test_from_tuples(self): + self.assertRaises(Exception, MultiIndex.from_tuples, []) + + idx = MultiIndex.from_tuples( ((1,2),(3,4)), names=['a', 'b'] ) + self.assertEquals(len(idx), 2) + + def test_argsort(self): + result = self.index.argsort() + expected = self.index._tuple_index.argsort() + self.assert_(np.array_equal(result, expected)) + + def test_sortlevel(self): + import random + + tuples = list(self.index) + random.shuffle(tuples) + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + self.assert_(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + self.assert_(sorted_idx.equals(expected[::-1])) 
+ + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + self.assert_(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + self.assert_(sorted_idx.equals(expected[::-1])) + + def test_sortlevel_deterministic(self): + tuples = [('bar', 'one'), ('foo', 'two'), ('qux', 'two'), + ('foo', 'one'), ('baz', 'two'), ('qux', 'one')] + + index = MultiIndex.from_tuples(tuples) + + sorted_idx, _ = index.sortlevel(0) + expected = MultiIndex.from_tuples(sorted(tuples)) + self.assert_(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(0, ascending=False) + self.assert_(sorted_idx.equals(expected[::-1])) + + sorted_idx, _ = index.sortlevel(1) + by1 = sorted(tuples, key=lambda x: (x[1], x[0])) + expected = MultiIndex.from_tuples(by1) + self.assert_(sorted_idx.equals(expected)) + + sorted_idx, _ = index.sortlevel(1, ascending=False) + self.assert_(sorted_idx.equals(expected[::-1])) + + + def test_dims(self): + pass + + def test_drop(self): + dropped = self.index.drop([('foo', 'two'), ('qux', 'one')]) + + index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')]) + dropped2 = self.index.drop(index) + + expected = self.index[[0, 2, 3, 5]] + self.assert_(dropped.equals(expected)) + self.assert_(dropped2.equals(expected)) + + dropped = self.index.drop(['bar']) + expected = self.index[[0, 1, 3, 4, 5]] + self.assert_(dropped.equals(expected)) + + index = MultiIndex.from_tuples([('bar', 'two')]) + self.assertRaises(Exception, self.index.drop, [('bar', 'two')]) + self.assertRaises(Exception, self.index.drop, index) + + # mixed partial / full drop + dropped = self.index.drop(['foo', ('qux', 'one')]) + expected = self.index[[2, 3, 5]] + self.assert_(dropped.equals(expected)) + + def test_droplevel_with_names(self): + index = self.index[self.index.get_loc('foo')] + dropped = index.droplevel(0) + self.assertEqual(dropped.name, 'second') + + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])], + names=['one', 'two', 'three']) + dropped = index.droplevel(0) + self.assertEqual(dropped.names, ['two', 'three']) + + dropped = index.droplevel('two') + expected = index.droplevel(1) + self.assert_(dropped.equals(expected)) + + def test_droplevel_multiple(self): + index = MultiIndex(levels=[Index(range(4)), + Index(range(4)), + Index(range(4))], + labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]), + np.array([0, 1, 0, 0, 0, 1, 0, 1]), + np.array([1, 0, 1, 1, 0, 0, 1, 0])], + names=['one', 'two', 'three']) + + dropped = index[:2].droplevel(['three', 'one']) + expected = index[:2].droplevel(2).droplevel(0) + self.assert_(dropped.equals(expected)) + + def test_insert(self): + # key contained in all levels + new_index = self.index.insert(0, ('bar', 'two')) + self.assert_(new_index.equal_levels(self.index)) + self.assert_(new_index[0] == ('bar', 'two')) + + # key not contained in all levels + new_index = self.index.insert(0, ('abc', 'three')) + self.assert_(np.array_equal(new_index.levels[0], + list(self.index.levels[0]) + ['abc'])) + self.assert_(np.array_equal(new_index.levels[1], + list(self.index.levels[1]) + ['three'])) + self.assert_(new_index[0] == ('abc', 'three')) + + # key wrong length + self.assertRaises(Exception, self.index.insert, 0, ('foo2',)) + + def test_take_preserve_name(self): + taken = self.index.take([3,0,1]) + self.assertEqual(taken.names, 
self.index.names) + + def test_join_level(self): + def _check_how(other, how): + join_index, lidx, ridx = other.join(self.index, how=how, + level='second', + return_indexers=True) + + exp_level = other.join(self.index.levels[1], how=how) + self.assert_(join_index.levels[0].equals(self.index.levels[0])) + self.assert_(join_index.levels[1].equals(exp_level)) + + # pare down levels + mask = np.array([x[1] in exp_level for x in self.index], dtype=bool) + exp_values = self.index.values[mask] + self.assert_(np.array_equal(join_index.values, exp_values)) + + if how in ('outer', 'inner'): + join_index2, ridx2, lidx2 = \ + self.index.join(other, how=how, level='second', + return_indexers=True) + + self.assert_(join_index.equals(join_index2)) + self.assert_(np.array_equal(lidx, lidx2)) + self.assert_(np.array_equal(ridx, ridx2)) + self.assert_(np.array_equal(join_index2.values, exp_values)) + + def _check_all(other): + _check_how(other, 'outer') + _check_how(other, 'inner') + _check_how(other, 'left') + _check_how(other, 'right') + + _check_all(Index(['three', 'one', 'two'])) + _check_all(Index(['one'])) + _check_all(Index(['one', 'three'])) + + # some corner cases + idx = Index(['three', 'one', 'two']) + result = idx.join(self.index, level='second') + self.assert_(isinstance(result, MultiIndex)) + + self.assertRaises(Exception, self.index.join, self.index, level=1) + + def test_reindex(self): + result, indexer = self.index.reindex(list(self.index[:4])) + self.assert_(isinstance(result, MultiIndex)) + + result, indexer = self.index.reindex(list(self.index)) + self.assert_(isinstance(result, MultiIndex)) + self.assert_(indexer is None) + + def test_reindex_level(self): + idx = Index(['one']) + + target, indexer = self.index.reindex(idx, level='second') + target2, indexer2 = idx.reindex(self.index, level='second') + + exp_index = self.index.join(idx, level='second', how='right') + exp_index2 = self.index.join(idx, level='second', how='left') + + self.assert_(target.equals(exp_index)) + exp_indexer = np.array([0, 2, 4]) + self.assert_(np.array_equal(indexer, exp_indexer)) + + self.assert_(target2.equals(exp_index2)) + exp_indexer2 = np.array([0, -1, 0, -1, 0, -1]) + self.assert_(np.array_equal(indexer2, exp_indexer2)) + + self.assertRaises(ValueError, self.index.reindex, + self.index, method='pad', level='second') + + self.assertRaises(ValueError, idx.reindex, + idx, method='bfill', level='first') + + def test_has_duplicates(self): + self.assert_(not self.index.has_duplicates) + self.assert_(self.index.append(self.index).has_duplicates) + + index = MultiIndex(levels=[[0, 1], [0, 1, 2]], + labels=[[0, 0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 0, 1, 2]]) + self.assert_(index.has_duplicates) + + + +def test_get_combined_index(): + from pandas.core.index import _get_combined_index + result = _get_combined_index([]) + assert(result.equals(Index([]))) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + # '--with-coverage', '--cover-package=pandas.core'], + exit=False) + + diff --git a/pandas/tests/test_internals.py b/pandas/tests/test_internals.py new file mode 100644 index 00000000..03f3e14f --- /dev/null +++ b/pandas/tests/test_internals.py @@ -0,0 +1,428 @@ +# pylint: disable=W0102 + +import unittest + +import numpy as np + +from pandas import Index, MultiIndex, DataFrame, Series +from pandas.core.internals import * +import pandas.core.internals as internals +import pandas.util.testing as tm + +from pandas.util.testing import (assert_almost_equal, 
assert_frame_equal, randn) + +def assert_block_equal(left, right): + assert_almost_equal(left.values, right.values) + assert(left.dtype == right.dtype) + assert(left.items.equals(right.items)) + assert(left.ref_items.equals(right.ref_items)) + +def get_float_mat(n, k): + return np.repeat(np.atleast_2d(np.arange(k, dtype=float)), n, axis=0) + +TEST_COLS = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'] +N = 10 + +def get_float_ex(cols=['a', 'c', 'e']): + floats = get_float_mat(N, len(cols)).T + return make_block(floats, cols, TEST_COLS) + +def get_complex_ex(cols=['h']): + complexes = (get_float_mat(N, 1).T * 1j).astype(np.complex128) + return make_block(complexes, cols, TEST_COLS) + +def get_obj_ex(cols=['b', 'd']): + mat = np.empty((N, 2), dtype=object) + mat[:, 0] = 'foo' + mat[:, 1] = 'bar' + return make_block(mat.T, cols, TEST_COLS) + +def get_bool_ex(cols=['f']): + mat = np.ones((N, 1), dtype=bool) + return make_block(mat.T, cols, TEST_COLS) + +def get_int_ex(cols=['g']): + mat = randn(N, 1).astype(int) + return make_block(mat.T, cols, TEST_COLS) + +def get_int32_ex(cols): + mat = randn(N, 1).astype(np.int32) + return make_block(mat.T, cols, TEST_COLS) + +def get_dt_ex(cols=['h']): + mat = randn(N, 1).astype(int).astype('M8[ns]') + return make_block(mat.T, cols, TEST_COLS) + +class TestBlock(unittest.TestCase): + + def setUp(self): + self.fblock = get_float_ex() + self.cblock = get_complex_ex() + self.oblock = get_obj_ex() + self.bool_block = get_bool_ex() + self.int_block = get_int_ex() + + def test_constructor(self): + int32block = get_int32_ex(['a']) + self.assert_(int32block.dtype == np.int64) + + def test_pickle(self): + import pickle + + def _check(blk): + pickled = pickle.dumps(blk) + unpickled = pickle.loads(pickled) + assert_block_equal(blk, unpickled) + + _check(self.fblock) + _check(self.cblock) + _check(self.oblock) + _check(self.bool_block) + + def test_ref_locs(self): + assert_almost_equal(self.fblock.ref_locs, [0, 2, 4]) + + def test_attrs(self): + self.assert_(self.fblock.shape == self.fblock.values.shape) + self.assert_(self.fblock.dtype == self.fblock.values.dtype) + self.assert_(len(self.fblock) == len(self.fblock.values)) + + def test_merge(self): + avals = randn(2, 10) + bvals = randn(2, 10) + + ref_cols = ['e', 'a', 'b', 'd', 'f'] + + ablock = make_block(avals, ['e', 'b'], ref_cols) + bblock = make_block(bvals, ['a', 'd'], ref_cols) + merged = ablock.merge(bblock) + exvals = np.vstack((avals, bvals)) + excols = ['e', 'b', 'a', 'd'] + eblock = make_block(exvals, excols, ref_cols) + eblock = eblock.reindex_items_from(ref_cols) + assert_block_equal(merged, eblock) + + # TODO: merge with mixed type? 
+ + def test_copy(self): + cop = self.fblock.copy() + self.assert_(cop is not self.fblock) + assert_block_equal(self.fblock, cop) + + def test_items(self): + cols = self.fblock.items + self.assert_(np.array_equal(cols, ['a', 'c', 'e'])) + + cols2 = self.fblock.items + self.assert_(cols is cols2) + + def test_assign_ref_items(self): + new_cols = Index(['foo', 'bar', 'baz', 'quux', 'hi']) + self.fblock.set_ref_items(new_cols) + self.assert_(np.array_equal(self.fblock.items, + ['foo', 'baz', 'hi'])) + + def test_reindex_index(self): + pass + + def test_reindex_items_from(self): + new_cols = Index(['e', 'b', 'c', 'f']) + reindexed = self.fblock.reindex_items_from(new_cols) + assert_almost_equal(reindexed.ref_locs, [0, 2]) + self.assertEquals(reindexed.values.shape[0], 2) + self.assert_((reindexed.values[0] == 2).all()) + self.assert_((reindexed.values[1] == 1).all()) + + def test_reindex_cast(self): + pass + + def test_insert(self): + pass + + def test_delete(self): + newb = self.fblock.delete('a') + assert_almost_equal(newb.ref_locs, [2, 4]) + self.assert_((newb.values[0] == 1).all()) + + newb = self.fblock.delete('c') + assert_almost_equal(newb.ref_locs, [0, 4]) + self.assert_((newb.values[1] == 2).all()) + + newb = self.fblock.delete('e') + assert_almost_equal(newb.ref_locs, [0, 2]) + self.assert_((newb.values[1] == 1).all()) + + self.assertRaises(Exception, self.fblock.delete, 'b') + + def test_split_block_at(self): + left, right = self.fblock.split_block_at('a') + self.assert_(left is None) + self.assert_(np.array_equal(right.items, ['c', 'e'])) + + left, right = self.fblock.split_block_at('c') + self.assert_(np.array_equal(left.items, ['a'])) + self.assert_(np.array_equal(right.items, ['e'])) + + left, right = self.fblock.split_block_at('e') + self.assert_(np.array_equal(left.items, ['a', 'c'])) + self.assert_(right is None) + + bblock = get_bool_ex(['f']) + left, right = bblock.split_block_at('f') + self.assert_(left is None) + self.assert_(right is None) + + def test_get(self): + pass + + def test_set(self): + pass + + def test_fillna(self): + pass + + def test_repr(self): + pass + + +class TestBlockManager(unittest.TestCase): + + def setUp(self): + self.blocks = [get_float_ex(), + get_obj_ex(), + get_bool_ex(), + get_int_ex(), + get_complex_ex()] + + all_items = [b.items for b in self.blocks] + + items = sorted(all_items[0].append(all_items[1:])) + items = Index(items) + for b in self.blocks: + b.ref_items = items + + self.mgr = BlockManager(self.blocks, [items, np.arange(N)]) + + def test_constructor_corner(self): + pass + + def test_attrs(self): + self.assertEquals(self.mgr.nblocks, len(self.mgr.blocks)) + self.assertEquals(len(self.mgr), len(self.mgr.items)) + + def test_is_mixed_dtype(self): + self.assert_(self.mgr.is_mixed_dtype()) + + items = Index(['a', 'b']) + blocks = [get_bool_ex(['a']), get_bool_ex(['b'])] + for b in blocks: + b.ref_items = items + + mgr = BlockManager(blocks, [items, np.arange(N)]) + self.assert_(not mgr.is_mixed_dtype()) + + def test_is_indexed_like(self): + self.assert_(self.mgr._is_indexed_like(self.mgr)) + mgr2 = self.mgr.reindex_axis(np.arange(N - 1), axis=1) + self.assert_(not self.mgr._is_indexed_like(mgr2)) + + def test_block_id_vector_item_dtypes(self): + expected = [0, 1, 0, 1, 0, 2, 3, 4] + result = self.mgr.block_id_vector + assert_almost_equal(expected, result) + + result = self.mgr.item_dtypes + expected = ['float64', 'object', 'float64', 'object', 'float64', + 'bool', 'int64', 'complex128'] + self.assert_(np.array_equal(result, expected)) + 
+ def test_union_block_items(self): + blocks = [get_float_ex(['a', 'b', 'c']), + get_float_ex(['c', 'd', 'e'])] + self.assertRaises(Exception, internals._union_block_items, blocks) + + blocks = [get_float_ex(['a', 'b', 'c']), + get_float_ex(['f', 'e', 'd'])] + self.assert_(np.array_equal(internals._union_block_items(blocks), + ['a', 'b', 'c', 'd', 'e', 'f'])) + + def test_duplicate_item_failure(self): + items = Index(['a', 'a']) + blocks = [get_bool_ex(['a']), get_float_ex(['a'])] + for b in blocks: + b.ref_items = items + + mgr = BlockManager(blocks, [items, np.arange(N)]) + self.assertRaises(Exception, mgr.iget, 1) + + def test_contains(self): + self.assert_('a' in self.mgr) + self.assert_('baz' not in self.mgr) + + def test_pickle(self): + import pickle + + pickled = pickle.dumps(self.mgr) + mgr2 = pickle.loads(pickled) + + # same result + assert_frame_equal(DataFrame(self.mgr), DataFrame(mgr2)) + + # share ref_items + self.assert_(mgr2.blocks[0].ref_items is mgr2.blocks[1].ref_items) + + def test_get(self): + pass + + def test_get_scalar(self): + for item in self.mgr.items: + for i, index in enumerate(self.mgr.axes[1]): + res = self.mgr.get_scalar((item, index)) + exp = self.mgr.get(item)[i] + assert_almost_equal(res, exp) + + def test_set(self): + pass + + def test_set_change_dtype(self): + self.mgr.set('baz', np.zeros(N, dtype=bool)) + + self.mgr.set('baz', np.repeat('foo', N)) + self.assert_(self.mgr.get('baz').dtype == np.object_) + + mgr2 = self.mgr.consolidate() + mgr2.set('baz', np.repeat('foo', N)) + self.assert_(mgr2.get('baz').dtype == np.object_) + + mgr2.set('quux', randn(N).astype(int)) + self.assert_(mgr2.get('quux').dtype == np.int64) + + mgr2.set('quux', randn(N)) + self.assert_(mgr2.get('quux').dtype == np.float_) + + def test_copy(self): + shallow = self.mgr.copy(deep=False) + + for cp_blk, blk in zip(shallow.blocks, self.mgr.blocks): + self.assert_(cp_blk.values is blk.values) + + def test_as_matrix(self): + pass + + def test_as_matrix_int_bool(self): + items = Index(['a', 'b']) + + blocks = [get_bool_ex(['a']), get_bool_ex(['b'])] + for b in blocks: + b.ref_items = items + index_sz = blocks[0].values.shape[1] + mgr = BlockManager(blocks, [items, np.arange(index_sz)]) + self.assert_(mgr.as_matrix().dtype == np.bool_) + + blocks = [get_int_ex(['a']), get_int_ex(['b'])] + for b in blocks: + b.ref_items = items + + mgr = BlockManager(blocks, [items, np.arange(index_sz)]) + self.assert_(mgr.as_matrix().dtype == np.int64) + + def test_as_matrix_datetime(self): + items = Index(['h', 'g']) + blocks = [get_dt_ex(['h']), get_dt_ex(['g'])] + for b in blocks: + b.ref_items = items + + index_sz = blocks[0].values.shape[1] + mgr = BlockManager(blocks, [items, np.arange(index_sz)]) + self.assert_(mgr.as_matrix().dtype == 'M8[ns]') + + def test_xs(self): + pass + + def test_interleave(self): + pass + + def test_consolidate(self): + pass + + def test_consolidate_ordering_issues(self): + self.mgr.set('f', randn(N)) + self.mgr.set('d', randn(N)) + self.mgr.set('b', randn(N)) + self.mgr.set('g', randn(N)) + self.mgr.set('h', randn(N)) + + cons = self.mgr.consolidate() + self.assertEquals(cons.nblocks, 1) + self.assert_(cons.blocks[0].items.equals(cons.items)) + + def test_reindex_index(self): + pass + + def test_reindex_items(self): + def _check_cols(before, after, cols): + for col in cols: + assert_almost_equal(after.get(col), before.get(col)) + + # not consolidated + vals = randn(N) + self.mgr.set('g', vals) + reindexed = self.mgr.reindex_items(['g', 'c', 'a', 'd']) + 
self.assertEquals(reindexed.nblocks, 2) + assert_almost_equal(reindexed.get('g'), vals.squeeze()) + _check_cols(self.mgr, reindexed, ['c', 'a', 'd']) + + def test_xs(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + + self.mgr.set_axis(1, index) + + result = self.mgr.xs('bar', axis=1) + expected = self.mgr.get_slice(slice(3, 5), axis=1) + + assert_frame_equal(DataFrame(result), DataFrame(expected)) + + def test_get_numeric_data(self): + int_ser = Series(np.array([0, 1, 2])) + float_ser = Series(np.array([0., 1., 2.])) + complex_ser = Series(np.array([0j, 1j, 2j])) + str_ser = Series(np.array(['a', 'b', 'c'])) + bool_ser = Series(np.array([True, False, True])) + obj_ser = Series(np.array([1, 'a', 5])) + dt_ser = Series(tm.makeDateIndex(3)) + #check types + df = DataFrame({'int' : int_ser, 'float' : float_ser, + 'complex' : complex_ser, 'str' : str_ser, + 'bool' : bool_ser, 'obj' : obj_ser, + 'dt' : dt_ser}) + xp = DataFrame({'int' : int_ser, 'float' : float_ser, + 'complex' : complex_ser}) + rs = DataFrame(df._data.get_numeric_data()) + assert_frame_equal(xp, rs) + + xp = DataFrame({'bool' : bool_ser}) + rs = DataFrame(df._data.get_numeric_data(type_list=bool)) + assert_frame_equal(xp, rs) + + rs = DataFrame(df._data.get_numeric_data(type_list=bool)) + df.ix[0, 'bool'] = not df.ix[0, 'bool'] + + self.assertEqual(rs.ix[0, 'bool'], df.ix[0, 'bool']) + + rs = DataFrame(df._data.get_numeric_data(type_list=bool, copy=True)) + df.ix[0, 'bool'] = not df.ix[0, 'bool'] + + self.assertEqual(rs.ix[0, 'bool'], not df.ix[0, 'bool']) + +if __name__ == '__main__': + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--pdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py new file mode 100644 index 00000000..d2b27db6 --- /dev/null +++ b/pandas/tests/test_multilevel.py @@ -0,0 +1,1415 @@ +# pylint: disable-msg=W0612,E1101,W0141 +from pandas.util.py3compat import StringIO +import nose +import unittest + +from numpy.random import randn +import numpy as np + +from pandas.core.index import Index, MultiIndex +from pandas import Panel, DataFrame, Series, notnull, isnull + +from pandas.util.testing import (assert_almost_equal, + assert_series_equal, + assert_frame_equal) +import pandas.core.common as com +import pandas.util.testing as tm +from pandas.util.compat import product as cart_product + +class TestMultiLevel(unittest.TestCase): + + def setUp(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + + self.single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], + labels=[[0, 1, 2, 3]], + names=['first']) + + # create test series object + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + s[3] = np.NaN + self.series = s + + tm.N = 100 + self.tdf = tm.makeTimeDataFrame() + self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month, + lambda x: 
x.day]).sum() + + # use Int64Index, to make sure things work + self.ymd.index.levels = [lev.astype('i8') + for lev in self.ymd.index.levels] + self.ymd.index.names = ['year', 'month', 'day'] + + def test_append(self): + a, b = self.frame[:5], self.frame[5:] + + result = a.append(b) + tm.assert_frame_equal(result, self.frame) + + result = a['A'].append(b['A']) + tm.assert_series_equal(result, self.frame['A']) + + def test_dataframe_constructor(self): + multi = DataFrame(np.random.randn(4, 4), + index=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + self.assert_(isinstance(multi.index, MultiIndex)) + self.assert_(not isinstance(multi.columns, MultiIndex)) + + multi = DataFrame(np.random.randn(4, 4), + columns=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + self.assert_(isinstance(multi.columns, MultiIndex)) + + def test_series_constructor(self): + multi = Series(1., index=[np.array(['a', 'a', 'b', 'b']), + np.array(['x', 'y', 'x', 'y'])]) + self.assert_(isinstance(multi.index, MultiIndex)) + + multi = Series(1., index=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + self.assert_(isinstance(multi.index, MultiIndex)) + + multi = Series(range(4), index=[['a', 'a', 'b', 'b'], + ['x', 'y', 'x', 'y']]) + self.assert_(isinstance(multi.index, MultiIndex)) + + def test_reindex_level(self): + # axis=0 + month_sums = self.ymd.sum(level='month') + result = month_sums.reindex(self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum) + + assert_frame_equal(result, expected) + + # Series + result = month_sums['A'].reindex(self.ymd.index, level=1) + expected = self.ymd['A'].groupby(level='month').transform(np.sum) + assert_series_equal(result, expected) + + # axis=1 + month_sums = self.ymd.T.sum(axis=1, level='month') + result = month_sums.reindex(columns=self.ymd.index, level=1) + expected = self.ymd.groupby(level='month').transform(np.sum).T + assert_frame_equal(result, expected) + + def test_binops_level(self): + def _check_op(opname): + op = getattr(DataFrame, opname) + month_sums = self.ymd.sum(level='month') + result = op(self.ymd, month_sums, level='month') + + broadcasted = self.ymd.groupby(level='month').transform(np.sum) + expected = op(self.ymd, broadcasted) + assert_frame_equal(result, expected) + + # Series + op = getattr(Series, opname) + result = op(self.ymd['A'], month_sums['A'], level='month') + broadcasted = self.ymd['A'].groupby(level='month').transform(np.sum) + expected = op(self.ymd['A'], broadcasted) + assert_series_equal(result, expected) + + _check_op('sub') + _check_op('add') + _check_op('mul') + _check_op('div') + + def test_pickle(self): + import cPickle + def _test_roundtrip(frame): + pickled = cPickle.dumps(frame) + unpickled = cPickle.loads(pickled) + assert_frame_equal(frame, unpickled) + + _test_roundtrip(self.frame) + _test_roundtrip(self.frame.T) + _test_roundtrip(self.ymd) + _test_roundtrip(self.ymd.T) + + def test_reindex(self): + reindexed = self.frame.ix[[('foo', 'one'), ('bar', 'one')]] + expected = self.frame.ix[[0, 3]] + assert_frame_equal(reindexed, expected) + + def test_reindex_preserve_levels(self): + new_index = self.ymd.index[::10] + chunk = self.ymd.reindex(new_index) + self.assert_(chunk.index is new_index) + + chunk = self.ymd.ix[new_index] + self.assert_(chunk.index is new_index) + + ymdT = self.ymd.T + chunk = ymdT.reindex(columns=new_index) + self.assert_(chunk.columns is new_index) + + chunk = ymdT.ix[:, new_index] + self.assert_(chunk.columns is new_index) + + def 
test_sort_index_preserve_levels(self): + result = self.frame.sort_index() + self.assertEquals(result.index.names, self.frame.index.names) + + def test_repr_to_string(self): + repr(self.frame) + repr(self.ymd) + repr(self.frame.T) + repr(self.ymd.T) + + buf = StringIO() + self.frame.to_string(buf=buf) + self.ymd.to_string(buf=buf) + self.frame.T.to_string(buf=buf) + self.ymd.T.to_string(buf=buf) + + def test_repr_name_coincide(self): + index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')], + names=['a', 'b', 'c']) + + df = DataFrame({'value': [0, 1]}, index=index) + + lines = repr(df).split('\n') + self.assert_(lines[2].startswith('a 0 foo')) + + def test_getitem_simple(self): + df = self.frame.T + + col = df['foo', 'one'] + assert_almost_equal(col.values, df.values[:, 0]) + self.assertRaises(KeyError, df.__getitem__, ('foo', 'four')) + self.assertRaises(KeyError, df.__getitem__, 'foobar') + + def test_series_getitem(self): + s = self.ymd['A'] + + result = s[2000, 3] + result2 = s.ix[2000, 3] + expected = s.reindex(s.index[42:65]) + expected.index = expected.index.droplevel(0).droplevel(0) + assert_series_equal(result, expected) + + result = s[2000, 3, 10] + expected = s[49] + self.assertEquals(result, expected) + + # fancy + result = s.ix[[(2000, 3, 10), (2000, 3, 13)]] + expected = s.reindex(s.index[49:51]) + assert_series_equal(result, expected) + + # key error + self.assertRaises(KeyError, s.__getitem__, (2000, 3, 4)) + + def test_series_getitem_corner(self): + s = self.ymd['A'] + + # don't segfault, GH #495 + # out of bounds access + self.assertRaises(IndexError, s.__getitem__, len(self.ymd)) + + # generator + result = s[(x > 0 for x in s)] + expected = s[s > 0] + assert_series_equal(result, expected) + + def test_series_setitem(self): + s = self.ymd['A'] + + s[2000, 3] = np.nan + self.assert_(isnull(s.values[42:65]).all()) + self.assert_(notnull(s.values[:42]).all()) + self.assert_(notnull(s.values[65:]).all()) + + s[2000, 3, 10] = np.nan + self.assert_(isnull(s[49])) + + def test_series_slice_partial(self): + pass + + def test_frame_getitem_setitem_slice(self): + # getitem + result = self.frame.ix[:4] + expected = self.frame[:4] + assert_frame_equal(result, expected) + + # setitem + cp = self.frame.copy() + cp.ix[:4] = 0 + + self.assert_((cp.values[:4] == 0).all()) + self.assert_((cp.values[4:] != 0).all()) + + def test_frame_getitem_setitem_multislice(self): + levels = [['t1', 't2'], ['a','b','c']] + labels = [[0,0,0,1,1], [0,1,2,0,1]] + midx = MultiIndex(labels=labels, levels=levels, names=[None, 'id']) + df = DataFrame({'value':[1,2,3,7,8]}, index=midx) + + result = df.ix[:,'value'] + assert_series_equal(df['value'], result) + + result = df.ix[1:3,'value'] + assert_series_equal(df['value'][1:3], result) + + result = df.ix[:,:] + assert_frame_equal(df, result) + + result = df + df.ix[:, 'value'] = 10 + result['value'] = 10 + assert_frame_equal(df, result) + + df.ix[:,:] = 10 + assert_frame_equal(df, result) + + def test_getitem_tuple_plus_slice(self): + # GH #671 + df = DataFrame({'a' : range(10), + 'b' : range(10), + 'c' : np.random.randn(10), + 'd' : np.random.randn(10)}) + + idf = df.set_index(['a', 'b']) + + result = idf.ix[(0, 0), :] + expected = idf.ix[0, 0] + expected2 = idf.xs((0, 0)) + + assert_series_equal(result, expected) + assert_series_equal(result, expected2) + + def test_getitem_setitem_tuple_plus_columns(self): + # GH #1013 + + df = self.ymd[:5] + + result = df.ix[(2000, 1, 6), ['A', 'B', 'C']] + expected = df.ix[2000, 1, 6][['A', 'B', 'C']] + 
assert_series_equal(result, expected) + + def test_xs(self): + xs = self.frame.xs(('bar', 'two')) + xs2 = self.frame.ix[('bar', 'two')] + + assert_series_equal(xs, xs2) + assert_almost_equal(xs.values, self.frame.values[4]) + + def test_xs_partial(self): + result = self.frame.xs('foo') + result2 = self.frame.ix['foo'] + expected = self.frame.T['foo'].T + assert_frame_equal(result, expected) + assert_frame_equal(result, result2) + + def test_xs_level(self): + result = self.frame.xs('two', level='second') + expected = self.frame[self.frame.index.get_level_values(1) == 'two'] + expected.index = expected.index.droplevel(1) + + assert_frame_equal(result, expected) + + index = MultiIndex.from_tuples([('x', 'y', 'z'), ('a', 'b', 'c'), + ('p', 'q', 'r')]) + df = DataFrame(np.random.randn(3, 5), index=index) + result = df.xs('c', level=2) + expected = df[1:2] + expected.index = expected.index.droplevel(2) + assert_frame_equal(result, expected) + + def test_xs_level_multiple(self): + from pandas import read_table + from StringIO import StringIO + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_table(StringIO(text), sep='\s+') + + result = df.xs(('a', 4), level=['one', 'four']) + expected = df.xs('a').xs(4, level='four') + assert_frame_equal(result, expected) + + def test_xs_level0(self): + from pandas import read_table + from StringIO import StringIO + text = """ A B C D E +one two three four +a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640 +a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744 +x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838""" + + df = read_table(StringIO(text), sep='\s+') + + result = df.xs('a', level=0) + expected = df.xs('a') + self.assertEqual(len(result), 2) + assert_frame_equal(result, expected) + + def test_xs_level_series(self): + s = self.frame['A'] + result = s[:, 'two'] + expected = self.frame.xs('two', level=1)['A'] + assert_series_equal(result, expected) + + s = self.ymd['A'] + result = s[2000, 5] + expected = self.ymd.ix[2000, 5]['A'] + assert_series_equal(result, expected) + + # not implementing this for now + + self.assertRaises(TypeError, s.__getitem__, (2000, slice(3, 4))) + + # result = s[2000, 3:4] + # lv =s.index.get_level_values(1) + # expected = s[(lv == 3) | (lv == 4)] + # expected.index = expected.index.droplevel(0) + # assert_series_equal(result, expected) + + # can do this though + + def test_get_loc_single_level(self): + s = Series(np.random.randn(len(self.single_level)), + index=self.single_level) + for k in self.single_level.values: + s[k] + + def test_getitem_toplevel(self): + df = self.frame.T + + result = df['foo'] + expected = df.reindex(columns=df.columns[:3]) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + + result = df['bar'] + result2 = df.ix[:, 'bar'] + + expected = df.reindex(columns=df.columns[3:5]) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result, result2) + + def test_getitem_setitem_slice_integers(self): + index = MultiIndex(levels=[[0, 1, 2], [0, 2]], + labels=[[0, 0, 1, 1, 2, 2], + [0, 1, 0, 1, 0, 1]]) + + frame = DataFrame(np.random.randn(len(index), 4), index=index, + columns=['a', 'b', 'c', 'd']) + res = frame.ix[1:2] + exp = frame.reindex(frame.index[2:]) + assert_frame_equal(res, exp) + + frame.ix[1:2] = 7 + self.assert_((frame.ix[1:2] == 7).values.all()) + + series = 
Series(np.random.randn(len(index)), index=index) + + res = series.ix[1:2] + exp = series.reindex(series.index[2:]) + assert_series_equal(res, exp) + + series.ix[1:2] = 7 + self.assert_((series.ix[1:2] == 7).values.all()) + + def test_getitem_int(self): + levels = [[0, 1], [0, 1, 2]] + labels = [[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]] + index = MultiIndex(levels=levels, labels=labels) + + frame = DataFrame(np.random.randn(6, 2), index=index) + + result = frame.ix[1] + expected = frame[-3:] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + + # raises exception + self.assertRaises(KeyError, frame.ix.__getitem__, 3) + + # however this will work + result = self.frame.ix[2] + expected = self.frame.xs(self.frame.index[2]) + assert_series_equal(result, expected) + + def test_getitem_partial(self): + ymd = self.ymd.T + result = ymd[2000, 2] + + expected = ymd.reindex(columns=ymd.columns[ymd.columns.labels[1] == 1]) + expected.columns = expected.columns.droplevel(0).droplevel(0) + assert_frame_equal(result, expected) + + def test_getitem_slice_not_sorted(self): + df = self.frame.sortlevel(1).T + + # buglet with int typechecking + result = df.ix[:, :np.int32(3)] + expected = df.reindex(columns=df.columns[:3]) + assert_frame_equal(result, expected) + + def test_setitem_change_dtype(self): + dft = self.frame.T + s = dft['foo', 'two'] + dft['foo', 'two'] = s > s.median() + assert_series_equal(dft['foo', 'two'], s > s.median()) + self.assert_(isinstance(dft._data.blocks[1].items, MultiIndex)) + + reindexed = dft.reindex(columns=[('foo', 'two')]) + assert_series_equal(reindexed['foo', 'two'], s > s.median()) + + def test_frame_setitem_ix(self): + self.frame.ix[('bar', 'two'), 'B'] = 5 + self.assertEquals(self.frame.ix[('bar', 'two'), 'B'], 5) + + # with integer labels + df = self.frame.copy() + df.columns = range(3) + df.ix[('bar', 'two'), 1] = 7 + self.assertEquals(df.ix[('bar', 'two'), 1], 7) + + def test_fancy_slice_partial(self): + result = self.frame.ix['bar':'baz'] + expected = self.frame[3:7] + assert_frame_equal(result, expected) + + result = self.ymd.ix[(2000,2):(2000,4)] + lev = self.ymd.index.labels[1] + expected = self.ymd[(lev >= 1) & (lev <= 3)] + assert_frame_equal(result, expected) + + def test_getitem_partial_column_select(self): + idx = MultiIndex(labels=[[0,0,0],[0,1,1],[1,0,1]], + levels=[['a','b'],['x','y'],['p','q']]) + df = DataFrame(np.random.rand(3,2),index=idx) + + result = df.ix[('a', 'y'), :] + expected = df.ix[('a', 'y')] + assert_frame_equal(result, expected) + + result = df.ix[('a', 'y'), [1, 0]] + expected = df.ix[('a', 'y')][[1, 0]] + assert_frame_equal(result, expected) + + self.assertRaises(KeyError, df.ix.__getitem__, + (('a', 'foo'), slice(None, None))) + + def test_sortlevel(self): + df = self.frame.copy() + df.index = np.arange(len(df)) + self.assertRaises(Exception, df.sortlevel, 0) + + # axis=1 + + # series + a_sorted = self.frame['A'].sortlevel(0) + self.assertRaises(Exception, + self.frame.reset_index()['A'].sortlevel) + + # preserve names + self.assertEquals(a_sorted.index.names, self.frame.index.names) + + def test_delevel_infer_dtype(self): + tuples = [tuple for tuple in cart_product(['foo', 'bar'], + [10, 20], [1.0, 1.1])] + index = MultiIndex.from_tuples(tuples, + names=['prm0', 'prm1', 'prm2']) + df = DataFrame(np.random.randn(8,3), columns=['A', 'B', 'C'], + index=index) + deleveled = df.reset_index() + self.assert_(com.is_integer_dtype(deleveled['prm1'])) + self.assert_(com.is_float_dtype(deleveled['prm2'])) + + def 
test_reset_index_with_drop(self): + deleveled = self.ymd.reset_index(drop = True) + self.assertEquals(len(deleveled.columns), len(self.ymd.columns)) + + deleveled = self.series.reset_index() + self.assert_(isinstance(deleveled, DataFrame)) + self.assert_(len(deleveled.columns) == len(self.series.index.levels)+1) + + deleveled = self.series.reset_index(drop = True) + self.assert_(isinstance(deleveled, Series)) + + def test_sortlevel_by_name(self): + self.frame.index.names = ['first', 'second'] + result = self.frame.sortlevel(level='second') + expected = self.frame.sortlevel(level=1) + assert_frame_equal(result, expected) + + def test_sortlevel_mixed(self): + sorted_before = self.frame.sortlevel(1) + + df = self.frame.copy() + df['foo'] = 'bar' + sorted_after = df.sortlevel(1) + assert_frame_equal(sorted_before, sorted_after.drop(['foo'], axis=1)) + + dft = self.frame.T + sorted_before = dft.sortlevel(1, axis=1) + dft['foo', 'three'] = 'bar' + + sorted_after = dft.sortlevel(1, axis=1) + assert_frame_equal(sorted_before.drop([('foo', 'three')], axis=1), + sorted_after.drop([('foo', 'three')], axis=1)) + + def test_count_level(self): + def _check_counts(frame, axis=0): + index = frame._get_axis(axis) + for i in range(index.nlevels): + result = frame.count(axis=axis, level=i) + expected = frame.groupby(axis=axis, level=i).count(axis=axis) + expected = expected.reindex_like(result).astype('i8') + assert_frame_equal(result, expected) + + self.frame.ix[1, [1, 2]] = np.nan + self.frame.ix[7, [0, 1]] = np.nan + self.ymd.ix[1, [1, 2]] = np.nan + self.ymd.ix[7, [0, 1]] = np.nan + + _check_counts(self.frame) + _check_counts(self.ymd) + _check_counts(self.frame.T, axis=1) + _check_counts(self.ymd.T, axis=1) + + # can't call with level on regular DataFrame + df = tm.makeTimeDataFrame() + self.assertRaises(Exception, df.count, level=0) + + self.frame['D'] = 'foo' + result = self.frame.count(level=0, numeric_only=True) + assert_almost_equal(result.columns, ['A', 'B', 'C']) + + def test_count_level_series(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz'], + ['one', 'two', 'three', 'four']], + labels=[[0, 0, 0, 2, 2], + [2, 0, 1, 1, 2]]) + + s = Series(np.random.randn(len(index)), index=index) + + result = s.count(level=0) + expected = s.groupby(level=0).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + result = s.count(level=1) + expected = s.groupby(level=1).count() + assert_series_equal(result.astype('f8'), + expected.reindex(result.index).fillna(0)) + + def test_count_level_corner(self): + s = self.frame['A'][:0] + result = s.count(level=0) + expected = Series(0, index=s.index.levels[0]) + assert_series_equal(result, expected) + + df = self.frame[:0] + result = df.count(level=0) + expected = DataFrame({}, index=s.index.levels[0], + columns=df.columns).fillna(0).astype(int) + assert_frame_equal(result, expected) + + def test_unstack(self): + # just check that it works for now + unstacked = self.ymd.unstack() + unstacked2 = unstacked.unstack() + + # test that ints work + unstacked = self.ymd.astype(int).unstack() + + def test_unstack_multiple_no_empty_columns(self): + index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0), + (1, 'baz', 1), (1, 'qux', 1)]) + + s = Series(np.random.randn(4), index=index) + + unstacked = s.unstack([1, 2]) + expected = unstacked.dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected) + + def test_stack(self): + # regular roundtrip + unstacked = self.ymd.unstack() + restacked = unstacked.stack() + 
assert_frame_equal(restacked, self.ymd) + + unlexsorted = self.ymd.sortlevel(2) + + unstacked = unlexsorted.unstack(2) + restacked = unstacked.stack() + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + unlexsorted = unlexsorted[::-1] + unstacked = unlexsorted.unstack(1) + restacked = unstacked.stack().swaplevel(1, 2) + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + unlexsorted = unlexsorted.swaplevel(0, 1) + unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1) + restacked = unstacked.stack(0).swaplevel(1, 2) + assert_frame_equal(restacked.sortlevel(0), self.ymd) + + # columns unsorted + unstacked = self.ymd.unstack() + unstacked = unstacked.sort(axis=1, ascending=False) + restacked = unstacked.stack() + assert_frame_equal(restacked, self.ymd) + + # more than 2 levels in the columns + unstacked = self.ymd.unstack(1).unstack(1) + + result = unstacked.stack(1) + expected = self.ymd.unstack() + assert_frame_equal(result, expected) + + result = unstacked.stack(2) + expected = self.ymd.unstack(1) + assert_frame_equal(result, expected) + + result = unstacked.stack(0) + expected = self.ymd.stack().unstack(1).unstack(1) + assert_frame_equal(result, expected) + + # not all levels present in each echelon + unstacked = self.ymd.unstack(2).ix[:, ::3] + stacked = unstacked.stack().stack() + ymd_stacked = self.ymd.stack() + assert_series_equal(stacked, ymd_stacked.reindex(stacked.index)) + + # stack with negative number + result = self.ymd.unstack(0).stack(-2) + expected = self.ymd.unstack(0).stack(0) + + def test_stack_mixed_dtype(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + df = df.sortlevel(1, axis=1) + + stacked = df.stack() + assert_series_equal(stacked['foo'], df['foo'].stack()) + self.assert_(stacked['bar'].dtype == np.float_) + + def test_unstack_bug(self): + df = DataFrame({'state': ['naive','naive','naive', + 'activ','activ','activ'], + 'exp':['a','b','b','b','a','a'], + 'barcode':[1,2,3,4,1,3], + 'v':['hi','hi','bye','bye','bye','peace'], + 'extra': np.arange(6.)}) + + result = df.groupby(['state','exp','barcode','v']).apply(len) + + unstacked = result.unstack() + restacked = unstacked.stack() + assert_series_equal(restacked, + result.reindex(restacked.index).astype(float)) + + def test_stack_unstack_preserve_names(self): + unstacked = self.frame.unstack() + self.assertEquals(unstacked.index.name, 'first') + self.assertEquals(unstacked.columns.names, ['exp', 'second']) + + restacked = unstacked.stack() + self.assertEquals(restacked.index.names, self.frame.index.names) + + def test_unstack_level_name(self): + result = self.frame.unstack('second') + expected = self.frame.unstack(level=1) + assert_frame_equal(result, expected) + + def test_stack_level_name(self): + unstacked = self.frame.unstack('second') + result = unstacked.stack('exp') + expected = self.frame.unstack().stack(0) + assert_frame_equal(result, expected) + + result = self.frame.stack('exp') + expected = self.frame.stack() + assert_series_equal(result, expected) + + def test_stack_unstack_multiple(self): + unstacked = self.ymd.unstack(['year', 'month']) + expected = self.ymd.unstack('year').unstack('month') + assert_frame_equal(unstacked, expected) + self.assertEquals(unstacked.columns.names, + expected.columns.names) + + # series + s = self.ymd['A'] + s_unstacked = s.unstack(['year', 'month']) + assert_frame_equal(s_unstacked, expected['A']) + + restacked = unstacked.stack(['year', 'month']) + restacked = restacked.swaplevel(0, 1).swaplevel(1, 2) + restacked = restacked.sortlevel(0) + + 
assert_frame_equal(restacked, self.ymd) + self.assertEquals(restacked.index.names, self.ymd.index.names) + + # GH #451 + unstacked = self.ymd.unstack([1, 2]) + expected = self.ymd.unstack(1).unstack(1).dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected) + + unstacked = self.ymd.unstack([2, 1]) + expected = self.ymd.unstack(2).unstack(1).dropna(axis=1, how='all') + assert_frame_equal(unstacked, expected.ix[:, unstacked.columns]) + + def test_groupby_transform(self): + s = self.frame['A'] + grouper = s.index.get_level_values(0) + + grouped = s.groupby(grouper) + + applied = grouped.apply(lambda x: x * 2) + expected = grouped.transform(lambda x: x * 2) + assert_series_equal(applied.reindex(expected.index), expected) + + def test_groupby_corner(self): + midx = MultiIndex(levels=[['foo'],['bar'],['baz']], + labels=[[0],[0],[0]], names=['one','two','three']) + df = DataFrame([np.random.rand(4)], columns=['a','b','c','d'], + index=midx) + # should work + df.groupby(level='three') + + def test_join(self): + a = self.frame.ix[:5, ['A']] + b = self.frame.ix[2:, ['B', 'C']] + + joined = a.join(b, how='outer').reindex(self.frame.index) + expected = self.frame.copy() + expected.values[np.isnan(joined.values)] = np.nan + + self.assert_(not np.isnan(joined.values).all()) + + assert_frame_equal(joined, expected) + + def test_swaplevel(self): + swapped = self.frame['A'].swaplevel(0, 1) + swapped2 = self.frame['A'].swaplevel('first', 'second') + self.assert_(not swapped.index.equals(self.frame.index)) + assert_series_equal(swapped, swapped2) + + back = swapped.swaplevel(0, 1) + back2 = swapped.swaplevel('second', 'first') + self.assert_(back.index.equals(self.frame.index)) + assert_series_equal(back, back2) + + ft = self.frame.T + swapped = ft.swaplevel('first', 'second', axis=1) + exp = self.frame.swaplevel('first', 'second').T + assert_frame_equal(swapped, exp) + + def test_swaplevel_panel(self): + panel = Panel({'ItemA' : self.frame, + 'ItemB' : self.frame * 2}) + + result = panel.swaplevel(0, 1, axis='major') + expected = panel.copy() + expected.major_axis = expected.major_axis.swaplevel(0, 1) + tm.assert_panel_equal(result, expected) + + def test_reorder_levels(self): + result = self.ymd.reorder_levels(['month', 'day', 'year']) + expected = self.ymd.swaplevel(0, 1).swaplevel(1, 2) + assert_frame_equal(result, expected) + + result = self.ymd['A'].reorder_levels(['month', 'day', 'year']) + expected = self.ymd['A'].swaplevel(0, 1).swaplevel(1, 2) + assert_series_equal(result, expected) + + result = self.ymd.T.reorder_levels(['month', 'day', 'year'], axis=1) + expected = self.ymd.T.swaplevel(0, 1, axis=1).swaplevel(1, 2, axis=1) + assert_frame_equal(result, expected) + + self.assertRaises(Exception, self.ymd.index.reorder_levels, + [1, 2, 3]) + + def test_insert_index(self): + df = self.ymd[:5].T + df[2000, 1, 10] = df[2000, 1, 7] + self.assert_(isinstance(df.columns, MultiIndex)) + self.assert_((df[2000, 1, 10] == df[2000, 1, 7]).all()) + + def test_alignment(self): + x = Series(data=[1,2,3], + index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B",3)])) + + y = Series(data=[4,5,6], + index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B",3)])) + + res = x - y + exp_index = x.index.union(y.index) + exp = x.reindex(exp_index) - y.reindex(exp_index) + assert_series_equal(res, exp) + + # hit non-monotonic code path + res = x[::-1] - y[::-1] + exp_index = x.index.union(y.index) + exp = x.reindex(exp_index) - y.reindex(exp_index) + assert_series_equal(res, exp) + + def test_is_lexsorted(self): 
+ levels = [[0, 1], [0, 1, 2]] + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + self.assert_(index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 2, 1]]) + self.assert_(not index.is_lexsorted()) + + index = MultiIndex(levels=levels, + labels=[[0, 0, 1, 0, 1, 1], + [0, 1, 0, 2, 2, 1]]) + self.assert_(not index.is_lexsorted()) + self.assert_(index.lexsort_depth == 0) + + def test_frame_getitem_view(self): + df = self.frame.T + df['foo'].values[:] = 0 + self.assert_((df['foo'].values == 0).all()) + + # but not if it's mixed-type + df['foo', 'four'] = 'foo' + df = df.sortlevel(0, axis=1) + df['foo']['one'] = 2 + self.assert_((df['foo', 'one'] == 0).all()) + + def test_frame_getitem_not_sorted(self): + df = self.frame.T + df['foo', 'four'] = 'foo' + + arrays = [np.array(x) for x in zip(*df.columns._tuple_index)] + + result = df['foo'] + result2 = df.ix[:, 'foo'] + expected = df.reindex(columns=df.columns[arrays[0] == 'foo']) + expected.columns = expected.columns.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + df = df.T + result = df.xs('foo') + result2 = df.ix['foo'] + expected = df.reindex(df.index[arrays[0] == 'foo']) + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_series_getitem_not_sorted(self): + arrays = [['bar', 'bar', 'baz', 'baz', 'qux', 'qux', 'foo', 'foo'], + ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + tuples = zip(*arrays) + index = MultiIndex.from_tuples(tuples) + s = Series(randn(8), index=index) + + arrays = [np.array(x) for x in zip(*index._tuple_index)] + + result = s['qux'] + result2 = s.ix['qux'] + expected = s[arrays[0] == 'qux'] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_count(self): + frame = self.frame.copy() + frame.index.names = ['a', 'b'] + + result = frame.count(level='b') + expect = self.frame.count(level=1) + assert_frame_equal(result, expect) + + result = frame.count(level='a') + expect = self.frame.count(level=0) + assert_frame_equal(result, expect) + + series = self.series.copy() + series.index.names = ['a', 'b'] + + result = series.count(level='b') + expect = self.series.count(level=1) + assert_series_equal(result, expect) + + result = series.count(level='a') + expect = self.series.count(level=0) + assert_series_equal(result, expect) + + self.assertRaises(Exception, series.count, 'x') + self.assertRaises(Exception, frame.count, level='x') + + AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', + 'mad', 'std', 'var'] + + def test_series_group_min_max(self): + for op, level, skipna in cart_product(self.AGG_FUNCTIONS, + range(2), + [False, True]): + grouped = self.series.groupby(level=level) + aggf = lambda x: getattr(x, op)(skipna=skipna) + # skipna=True + leftside = grouped.agg(aggf) + rightside = getattr(self.series, op)(level=level, skipna=skipna) + assert_series_equal(leftside, rightside) + + def test_frame_group_ops(self): + self.frame.ix[1, [1, 2]] = np.nan + self.frame.ix[7, [0, 1]] = np.nan + + for op, level, axis, skipna in cart_product(self.AGG_FUNCTIONS, + range(2), range(2), + [False, True]): + if axis == 0: + frame = self.frame + else: + frame = self.frame.T + + grouped = frame.groupby(level=level, axis=axis) + + pieces = [] + def aggf(x): + pieces.append(x) + return getattr(x, 
op)(skipna=skipna, axis=axis) + leftside = grouped.agg(aggf) + rightside = getattr(frame, op)(level=level, axis=axis, + skipna=skipna) + + # for good measure, groupby detail + level_index = frame._get_axis(axis).levels[level] + + self.assert_(leftside._get_axis(axis).equals(level_index)) + self.assert_(rightside._get_axis(axis).equals(level_index)) + + assert_frame_equal(leftside, rightside) + + def test_frame_any_all_group(self): + df = DataFrame({'data': [False, False, True, False, True, False, True]}, + index=[['one', 'one', 'two', 'one', 'two', 'two', 'two'], + [0, 1, 0, 2, 1, 2, 3]]) + + result = df.any(level=0) + ex = DataFrame({'data': [False, True]}, index=['one', 'two']) + assert_frame_equal(result, ex) + + result = df.all(level=0) + ex = DataFrame({'data': [False, False]}, index=['one', 'two']) + assert_frame_equal(result, ex) + + def test_std_var_pass_ddof(self): + index = MultiIndex.from_arrays([np.arange(5).repeat(10), + np.tile(np.arange(10), 5)]) + df = DataFrame(np.random.randn(len(index), 5), index=index) + + for meth in ['var', 'std']: + ddof = 4 + alt = lambda x: getattr(x, meth)(ddof=ddof) + + result = getattr(df[0], meth)(level=0, ddof=ddof) + expected = df[0].groupby(level=0).agg(alt) + assert_series_equal(result, expected) + + result = getattr(df, meth)(level=0, ddof=ddof) + expected = df.groupby(level=0).agg(alt) + assert_frame_equal(result, expected) + + + def test_frame_series_agg_multiple_levels(self): + result = self.ymd.sum(level=['year', 'month']) + expected = self.ymd.groupby(level=['year', 'month']).sum() + assert_frame_equal(result, expected) + + result = self.ymd['A'].sum(level=['year', 'month']) + expected = self.ymd['A'].groupby(level=['year', 'month']).sum() + assert_series_equal(result, expected) + + def test_groupby_multilevel(self): + result = self.ymd.groupby(level=[0, 1]).mean() + + k1 = self.ymd.index.get_level_values(0) + k2 = self.ymd.index.get_level_values(1) + + expected = self.ymd.groupby([k1, k2]).mean() + + assert_frame_equal(result, expected) + self.assertEquals(result.index.names, self.ymd.index.names[:2]) + + result2 = self.ymd.groupby(level=self.ymd.index.names[:2]).mean() + assert_frame_equal(result, result2) + + def test_groupby_multilevel_with_transform(self): + pass + + def test_multilevel_consolidate(self): + index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'), + ('bar', 'one'), ('bar', 'two')]) + df = DataFrame(np.random.randn(4, 4), index=index, columns=index) + df['Totals', ''] = df.sum(1) + df = df.consolidate() + + def test_ix_preserve_names(self): + result = self.ymd.ix[2000] + result2 = self.ymd['A'].ix[2000] + self.assertEquals(result.index.names, self.ymd.index.names[1:]) + self.assertEquals(result2.index.names, self.ymd.index.names[1:]) + + result = self.ymd.ix[2000, 2] + result2 = self.ymd['A'].ix[2000, 2] + self.assertEquals(result.index.name, self.ymd.index.names[2]) + self.assertEquals(result2.index.name, self.ymd.index.names[2]) + + def test_partial_set(self): + # GH #397 + df = self.ymd.copy() + exp = self.ymd.copy() + df.ix[2000, 4] = 0 + exp.ix[2000, 4].values[:] = 0 + assert_frame_equal(df, exp) + + df['A'].ix[2000, 4] = 1 + exp['A'].ix[2000, 4].values[:] = 1 + assert_frame_equal(df, exp) + + df.ix[2000] = 5 + exp.ix[2000].values[:] = 5 + assert_frame_equal(df, exp) + + # this works...for now + df['A'].ix[14] = 5 + self.assertEquals(df['A'][14], 5) + + def test_unstack_preserve_types(self): + # GH #403 + self.ymd['E'] = 'foo' + self.ymd['F'] = 2 + + unstacked = self.ymd.unstack('month') + 
self.assert_(unstacked['A', 1].dtype == np.float64) + self.assert_(unstacked['E', 1].dtype == np.object_) + self.assert_(unstacked['F', 1].dtype == np.float64) + + def test_getitem_lowerdim_corner(self): + self.assertRaises(KeyError, self.frame.ix.__getitem__, + (('bar', 'three'), 'B')) + + self.assertRaises(KeyError, self.frame.ix.__setitem__, + (('bar', 'three'), 'B'), 0) + + #---------------------------------------------------------------------- + # AMBIGUOUS CASES! + + def test_partial_ix_missing(self): + raise nose.SkipTest + + result = self.ymd.ix[2000, 0] + expected = self.ymd.ix[2000]['A'] + assert_series_equal(result, expected) + + # need to put in some work here + + # self.ymd.ix[2000, 0] = 0 + # self.assert_((self.ymd.ix[2000]['A'] == 0).all()) + + self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6)) + self.assertRaises(Exception, self.ymd.ix.__getitem__, (2000, 6), 0) + + #---------------------------------------------------------------------- + + def test_to_html(self): + self.ymd.columns.name = 'foo' + self.ymd.to_html() + self.ymd.T.to_html() + + def test_level_with_tuples(self): + index = MultiIndex(levels=[[('foo', 'bar', 0), ('foo', 'baz', 0), + ('foo', 'qux', 0)], + [0, 1]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + result = series[('foo', 'bar', 0)] + result2 = series.ix[('foo', 'bar', 0)] + expected = series[:2] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + self.assertRaises(KeyError, series.__getitem__, (('foo', 'bar', 0), 2)) + + result = frame.ix[('foo', 'bar', 0)] + result2 = frame.xs(('foo', 'bar', 0)) + expected = frame[:2] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + index = MultiIndex(levels=[[('foo', 'bar'), ('foo', 'baz'), + ('foo', 'qux')], + [0, 1]], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]]) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + result = series[('foo', 'bar')] + result2 = series.ix[('foo', 'bar')] + expected = series[:2] + expected.index = expected.index.droplevel(0) + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + result = frame.ix[('foo', 'bar')] + result2 = frame.xs(('foo', 'bar')) + expected = frame[:2] + expected.index = expected.index.droplevel(0) + assert_frame_equal(result, expected) + assert_frame_equal(result2, expected) + + def test_int_series_slicing(self): + s = self.ymd['A'] + result = s[5:] + expected = s.reindex(s.index[5:]) + assert_series_equal(result, expected) + + exp = self.ymd['A'].copy() + s[5:] = 0 + exp.values[5:] = 0 + self.assert_(np.array_equal(s.values, exp.values)) + + result = self.ymd[5:] + expected = self.ymd.reindex(s.index[5:]) + assert_frame_equal(result, expected) + + def test_mixed_depth_get(self): + arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], + [ '', 'wx', 'wy', '', '', '']] + + tuples = zip(*arrays) + tuples.sort() + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4,6),columns = index) + + result = df['a'] + expected = df['a','',''] + assert_series_equal(result, expected) + self.assertEquals(result.name, 'a') + + result = df['routine1','result1'] + expected = df['routine1','result1',''] + 
assert_series_equal(result, expected) + self.assertEquals(result.name, ('routine1', 'result1')) + + def test_mixed_depth_insert(self): + arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], + [ '', 'wx', 'wy', '', '', '']] + + tuples = zip(*arrays) + tuples.sort() + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4,6),columns = index) + + result = df.copy() + expected = df.copy() + result['b'] = [1,2,3,4] + expected['b','',''] = [1,2,3,4] + assert_frame_equal(result, expected) + + def test_mixed_depth_drop(self): + arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], + [ '', 'wx', 'wy', '', '', '']] + + tuples = zip(*arrays) + tuples.sort() + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4,6),columns = index) + + result = df.drop('a',axis=1) + expected = df.drop([('a','','')],axis=1) + assert_frame_equal(expected, result) + + result = df.drop(['top'],axis=1) + expected = df.drop([('top','OD','wx')], axis=1) + expected = expected.drop([('top','OD','wy')], axis=1) + assert_frame_equal(expected, result) + + result = df.drop(('top', 'OD', 'wx'), axis=1) + expected = df.drop([('top','OD','wx')], axis=1) + assert_frame_equal(expected, result) + + expected = df.drop([('top','OD','wy')], axis=1) + expected = df.drop('top', axis=1) + + result = df.drop('result1', level=1, axis=1) + expected = df.drop([('routine1', 'result1', ''), + ('routine2', 'result1', '')], axis=1) + assert_frame_equal(expected, result) + + def test_mixed_depth_pop(self): + arrays = [[ 'a', 'top', 'top', 'routine1', 'routine1', 'routine2'], + [ '', 'OD', 'OD', 'result1', 'result2', 'result1'], + [ '', 'wx', 'wy', '', '', '']] + + tuples = zip(*arrays) + tuples.sort() + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4,6),columns = index) + + df1 = df.copy() + df2 = df.copy() + result = df1.pop('a') + expected = df2.pop(('a','','')) + assert_series_equal(expected, result) + assert_frame_equal(df1, df2) + self.assertEquals(result.name,'a') + + expected = df1['top'] + df1 = df1.drop(['top'],axis=1) + result = df2.pop('top') + assert_frame_equal(expected, result) + assert_frame_equal(df1, df2) + + def test_reindex_level_partial_selection(self): + result = self.frame.reindex(['foo', 'qux'], level=0) + expected = self.frame.ix[[0, 1, 2, 7, 8, 9]] + assert_frame_equal(result, expected) + + result = self.frame.T.reindex_axis(['foo', 'qux'], axis=1, level=0) + assert_frame_equal(result, expected.T) + + result = self.frame.ix[['foo', 'qux']] + assert_frame_equal(result, expected) + + result = self.frame['A'].ix[['foo', 'qux']] + assert_series_equal(result, expected['A']) + + result = self.frame.T.ix[:, ['foo', 'qux']] + assert_frame_equal(result, expected.T) + + def test_setitem_multiple_partial(self): + expected = self.frame.copy() + result = self.frame.copy() + result.ix[['foo', 'bar']] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_frame_equal(result, expected) + + expected = self.frame.copy() + result = self.frame.copy() + result.ix['foo':'bar'] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_frame_equal(result, expected) + + expected = self.frame['A'].copy() + result = self.frame['A'].copy() + result.ix[['foo', 'bar']] = 0 + expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_series_equal(result, expected) + + expected = self.frame['A'].copy() + result = self.frame['A'].copy() + result.ix['foo':'bar'] = 0 + 
expected.ix['foo'] = 0 + expected.ix['bar'] = 0 + assert_series_equal(result, expected) + + def test_drop_level(self): + result = self.frame.drop(['bar', 'qux'], level='first') + expected = self.frame.ix[[0, 1, 2, 5, 6]] + assert_frame_equal(result, expected) + + result = self.frame.drop(['two'], level='second') + expected = self.frame.ix[[0, 2, 3, 6, 7, 9]] + assert_frame_equal(result, expected) + + result = self.frame.T.drop(['bar', 'qux'], axis=1, level='first') + expected = self.frame.ix[[0, 1, 2, 5, 6]].T + assert_frame_equal(result, expected) + + result = self.frame.T.drop(['two'], axis=1, level='second') + expected = self.frame.ix[[0, 2, 3, 6, 7, 9]].T + assert_frame_equal(result, expected) + + def test_drop_preserve_names(self): + index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [1, 2, 3, 1, 2, 3]], + names=['one', 'two']) + + df = DataFrame(np.random.randn(6, 3), index=index) + + result = df.drop([(0, 2)]) + self.assert_(result.index.names == ['one', 'two']) + + def test_unicode_repr_issues(self): + levels = [Index([u'a/\u03c3', u'b/\u03c3',u'c/\u03c3']), + Index([0, 1])] + labels = [np.arange(3).repeat(2), np.tile(np.arange(2), 3)] + index = MultiIndex(levels=levels, labels=labels) + + repr(index.levels) + + # NumPy bug + # repr(index.get_level_values(1)) + + def test_dataframe_insert_column_all_na(self): + # GH #1534 + mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c')]) + df = DataFrame([[1,2],[3,4],[5,6]], index=mix) + s = Series({(1,1): 1, (1,2): 2}) + df['new'] = s + self.assert_(df['new'].isnull().all()) + + def test_join_segfault(self): + # 1532 + df1 = DataFrame({'a': [1, 1], 'b': [1, 2], 'x': [1, 2]}) + df2 = DataFrame({'a': [2, 2], 'b': [1, 2], 'y': [1, 2]}) + df1 = df1.set_index(['a', 'b']) + df2 = df2.set_index(['a', 'b']) + # it works! 
+ for how in ['left', 'right', 'outer']: + df1.join(df2, how=how) + + def test_set_column_scalar_with_ix(self): + subset = self.frame.index[[1, 4, 5]] + + self.frame.ix[subset] = 99 + self.assert_((self.frame.ix[subset].values == 99).all()) + + col = self.frame['B'] + col[subset] = 97 + self.assert_((self.frame.ix[subset, 'B'] == 97).all()) + +if __name__ == '__main__': + + # unittest.main() + import nose + # nose.runmodule(argv=[__file__,'-vvs','-x', '--pdb-failure'], + # exit=False) + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_ndframe.py b/pandas/tests/test_ndframe.py new file mode 100644 index 00000000..70a5d79d --- /dev/null +++ b/pandas/tests/test_ndframe.py @@ -0,0 +1,30 @@ +import unittest + +import numpy as np + +from pandas.core.generic import NDFrame +import pandas.util.testing as t + +class TestNDFrame(unittest.TestCase): + + def setUp(self): + tdf = t.makeTimeDataFrame() + self.ndf = NDFrame(tdf._data) + + def test_constructor(self): + # with cast + ndf = NDFrame(self.ndf._data, dtype=np.int64) + self.assert_(ndf.values.dtype == np.int64) + + def test_ndim(self): + self.assertEquals(self.ndf.ndim, 2) + + def test_astype(self): + casted = self.ndf.astype(int) + self.assert_(casted.values.dtype == np.int64) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py new file mode 100644 index 00000000..dcb055ad --- /dev/null +++ b/pandas/tests/test_panel.py @@ -0,0 +1,1336 @@ +# pylint: disable=W0612,E1101 + +from datetime import datetime +import operator +import unittest +import nose + +import numpy as np + +from pandas import DataFrame, Index, isnull, notnull, pivot, MultiIndex +from pandas.core.datetools import bday +from pandas.core.frame import group_agg +from pandas.core.panel import Panel +from pandas.core.series import remove_na +import pandas.core.common as com +from pandas.util import py3compat + +from pandas.util.testing import (assert_panel_equal, + assert_frame_equal, + assert_series_equal, + assert_almost_equal) +import pandas.core.panel as panelm +import pandas.util.testing as tm + +class PanelTests(object): + panel = None + + def test_pickle(self): + import cPickle + pickled = cPickle.dumps(self.panel) + unpickled = cPickle.loads(pickled) + assert_frame_equal(unpickled['ItemA'], self.panel['ItemA']) + + def test_cumsum(self): + cumsum = self.panel.cumsum() + assert_frame_equal(cumsum['ItemA'], self.panel['ItemA'].cumsum()) + +class SafeForLongAndSparse(object): + + def test_repr(self): + foo = repr(self.panel) + + def test_iter(self): + tm.equalContents(list(self.panel), self.panel.items) + + def test_count(self): + f = lambda s: notnull(s).sum() + self._check_stat_op('count', f, obj=self.panel, has_skipna=False) + + def test_sum(self): + self._check_stat_op('sum', np.sum) + + def test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_prod(self): + self._check_stat_op('prod', np.prod) + + def test_median(self): + def wrapper(x): + if isnull(x).any(): + return np.nan + return np.median(x) + + self._check_stat_op('median', wrapper) + + def test_min(self): + self._check_stat_op('min', np.min) + + def test_max(self): + self._check_stat_op('max', np.max) + + def test_skew(self): + try: + from scipy.stats import skew + except ImportError: + raise nose.SkipTest + def this_skew(x): + if len(x) < 3: + return np.nan + return skew(x, bias=False) + 
self._check_stat_op('skew', this_skew) + + # def test_mad(self): + # f = lambda x: np.abs(x - x.mean()).mean() + # self._check_stat_op('mad', f) + + def test_var(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.var(x, ddof=1) + self._check_stat_op('var', alt) + + def test_std(self): + def alt(x): + if len(x) < 2: + return np.nan + return np.std(x, ddof=1) + self._check_stat_op('std', alt) + + # def test_skew(self): + # from scipy.stats import skew + + # def alt(x): + # if len(x) < 3: + # return np.nan + # return skew(x, bias=False) + + # self._check_stat_op('skew', alt) + + def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): + if obj is None: + obj = self.panel + + # # set some NAs + # obj.ix[5:10] = np.nan + # obj.ix[15:20, -2:] = np.nan + + f = getattr(obj, name) + + if has_skipna: + def skipna_wrapper(x): + nona = remove_na(x) + if len(nona) == 0: + return np.nan + return alternative(nona) + + def wrapper(x): + return alternative(np.asarray(x)) + + for i in range(obj.ndim): + result = f(axis=i, skipna=False) + assert_frame_equal(result, obj.apply(wrapper, axis=i)) + else: + skipna_wrapper = alternative + wrapper = alternative + + for i in range(obj.ndim): + result = f(axis=i) + assert_frame_equal(result, obj.apply(skipna_wrapper, axis=i)) + + self.assertRaises(Exception, f, axis=obj.ndim) + +class SafeForSparse(object): + + @classmethod + def assert_panel_equal(cls, x, y): + assert_panel_equal(x, y) + + def test_get_axis(self): + assert(self.panel._get_axis(0) is self.panel.items) + assert(self.panel._get_axis(1) is self.panel.major_axis) + assert(self.panel._get_axis(2) is self.panel.minor_axis) + + def test_set_axis(self): + new_items = Index(np.arange(len(self.panel.items))) + new_major = Index(np.arange(len(self.panel.major_axis))) + new_minor = Index(np.arange(len(self.panel.minor_axis))) + + # ensure propagate to potentially prior-cached items too + item = self.panel['ItemA'] + self.panel.items = new_items + + if hasattr(self.panel, '_item_cache'): + self.assert_('ItemA' not in self.panel._item_cache) + self.assert_(self.panel.items is new_items) + + item = self.panel[0] + self.panel.major_axis = new_major + self.assert_(self.panel[0].index is new_major) + self.assert_(self.panel.major_axis is new_major) + + item = self.panel[0] + self.panel.minor_axis = new_minor + self.assert_(self.panel[0].columns is new_minor) + self.assert_(self.panel.minor_axis is new_minor) + + def test_get_axis_number(self): + self.assertEqual(self.panel._get_axis_number('items'), 0) + self.assertEqual(self.panel._get_axis_number('major'), 1) + self.assertEqual(self.panel._get_axis_number('minor'), 2) + + def test_get_axis_name(self): + self.assertEqual(self.panel._get_axis_name(0), 'items') + self.assertEqual(self.panel._get_axis_name(1), 'major_axis') + self.assertEqual(self.panel._get_axis_name(2), 'minor_axis') + + def test_get_plane_axes(self): + # what to do here? 
+ + index, columns = self.panel._get_plane_axes('items') + index, columns = self.panel._get_plane_axes('major_axis') + index, columns = self.panel._get_plane_axes('minor_axis') + index, columns = self.panel._get_plane_axes(0) + + def test_truncate(self): + dates = self.panel.major_axis + start, end = dates[1], dates[5] + + trunced = self.panel.truncate(start, end, axis='major') + expected = self.panel['ItemA'].truncate(start, end) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(before=start, axis='major') + expected = self.panel['ItemA'].truncate(before=start) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(after=end, axis='major') + expected = self.panel['ItemA'].truncate(after=end) + + assert_frame_equal(trunced['ItemA'], expected) + + # XXX test other axes + + def test_arith(self): + self._test_op(self.panel, operator.add) + self._test_op(self.panel, operator.sub) + self._test_op(self.panel, operator.mul) + self._test_op(self.panel, operator.truediv) + self._test_op(self.panel, operator.floordiv) + self._test_op(self.panel, operator.pow) + + self._test_op(self.panel, lambda x, y: y + x) + self._test_op(self.panel, lambda x, y: y - x) + self._test_op(self.panel, lambda x, y: y * x) + self._test_op(self.panel, lambda x, y: y / x) + self._test_op(self.panel, lambda x, y: y ** x) + + self.assertRaises(Exception, self.panel.__add__, self.panel['ItemA']) + + @staticmethod + def _test_op(panel, op): + result = op(panel, 1) + assert_frame_equal(result['ItemA'], op(panel['ItemA'], 1)) + + def test_keys(self): + tm.equalContents(self.panel.keys(), self.panel.items) + + def test_iteritems(self): + # Test panel.iteritems(), aka panel.iterkv() + # just test that it works + for k, v in self.panel.iterkv(): + pass + + self.assertEqual(len(list(self.panel.iterkv())), + len(self.panel.items)) + + def test_combineFrame(self): + def check_op(op, name): + # items + df = self.panel['ItemA'] + + func = getattr(self.panel, name) + + result = func(df, axis='items') + + assert_frame_equal(result['ItemB'], op(self.panel['ItemB'], df)) + + # major + xs = self.panel.major_xs(self.panel.major_axis[0]) + result = func(xs, axis='major') + + idx = self.panel.major_axis[1] + + assert_frame_equal(result.major_xs(idx), + op(self.panel.major_xs(idx), xs)) + + # minor + xs = self.panel.minor_xs(self.panel.minor_axis[0]) + result = func(xs, axis='minor') + + idx = self.panel.minor_axis[1] + + assert_frame_equal(result.minor_xs(idx), + op(self.panel.minor_xs(idx), xs)) + + check_op(operator.add, 'add') + check_op(operator.sub, 'subtract') + check_op(operator.mul, 'multiply') + if py3compat.PY3: + check_op(operator.truediv, 'divide') + else: + check_op(operator.div, 'divide') + + def test_combinePanel(self): + result = self.panel.add(self.panel) + self.assert_panel_equal(result, self.panel * 2) + + def test_neg(self): + self.assert_panel_equal(-self.panel, self.panel * -1) + + def test_select(self): + p = self.panel + + # select items + result = p.select(lambda x: x in ('ItemA', 'ItemC'), axis='items') + expected = p.reindex(items=['ItemA', 'ItemC']) + self.assert_panel_equal(result, expected) + + # select major_axis + result = p.select(lambda x: x >= datetime(2000, 1, 15), axis='major') + new_major = p.major_axis[p.major_axis >= datetime(2000, 1, 15)] + expected = p.reindex(major=new_major) + self.assert_panel_equal(result, expected) + + # select minor_axis + result = p.select(lambda x: x in ('D', 'A'), axis=2) + expected = p.reindex(minor=['A', 'D']) 
+ self.assert_panel_equal(result, expected) + + # corner case, empty thing + result = p.select(lambda x: x in ('foo',), axis='items') + self.assert_panel_equal(result, p.reindex(items=[])) + + def test_get_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + result = self.panel.get_value(item, mjr, mnr) + expected = self.panel[item][mnr][mjr] + assert_almost_equal(result, expected) + + def test_abs(self): + result = self.panel.abs() + expected = np.abs(self.panel) + self.assert_panel_equal(result, expected) + + df = self.panel['ItemA'] + result = df.abs() + expected = np.abs(df) + assert_frame_equal(result, expected) + + s = df['A'] + result = s.abs() + expected = np.abs(s) + assert_series_equal(result, expected) + +class CheckIndexing(object): + + + def test_getitem(self): + self.assertRaises(Exception, self.panel.__getitem__, 'ItemQ') + + def test_delitem_and_pop(self): + expected = self.panel['ItemA'] + result = self.panel.pop('ItemA') + assert_frame_equal(expected, result) + self.assert_('ItemA' not in self.panel.items) + + del self.panel['ItemB'] + self.assert_('ItemB' not in self.panel.items) + self.assertRaises(Exception, self.panel.__delitem__, 'ItemB') + + values = np.empty((3, 3, 3)) + values[0] = 0 + values[1] = 1 + values[2] = 2 + + panel = Panel(values, range(3), range(3), range(3)) + + # did we delete the right row? + + panelc = panel.copy() + del panelc[0] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[2], panel[2]) + + panelc = panel.copy() + del panelc[1] + assert_frame_equal(panelc[0], panel[0]) + assert_frame_equal(panelc[2], panel[2]) + + panelc = panel.copy() + del panelc[2] + assert_frame_equal(panelc[1], panel[1]) + assert_frame_equal(panelc[0], panel[0]) + + def test_setitem(self): + # LongPanel with one item + lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() + self.assertRaises(Exception, self.panel.__setitem__, + 'ItemE', lp) + + # DataFrame + df = self.panel['ItemA'][2:].filter(items=['A', 'B']) + self.panel['ItemF'] = df + self.panel['ItemE'] = df + + df2 = self.panel['ItemF'] + + assert_frame_equal(df, df2.reindex(index=df.index, + columns=df.columns)) + + # scalar + self.panel['ItemG'] = 1 + self.panel['ItemE'] = True + self.assert_(self.panel['ItemG'].values.dtype == np.int64) + self.assert_(self.panel['ItemE'].values.dtype == np.bool_) + + # object dtype + self.panel['ItemQ'] = 'foo' + self.assert_(self.panel['ItemQ'].values.dtype == np.object_) + + # boolean dtype + self.panel['ItemP'] = self.panel['ItemA'] > 0 + self.assert_(self.panel['ItemP'].values.dtype == np.bool_) + + def test_setitem_ndarray(self): + from pandas import date_range, datetools + + timeidx = date_range(start=datetime(2009,1,1), + end=datetime(2009,12,31), + freq=datetools.MonthEnd()) + lons_coarse = np.linspace(-177.5, 177.5, 72) + lats_coarse = np.linspace(-87.5, 87.5, 36) + P = Panel(items=timeidx, major_axis=lons_coarse, + minor_axis=lats_coarse) + data = np.random.randn(72*36).reshape((72,36)) + key = datetime(2009,2,28) + P[key] = data + + assert_almost_equal(P[key].values, data) + + def test_major_xs(self): + ref = self.panel['ItemA'] + + idx = self.panel.major_axis[5] + xs = self.panel.major_xs(idx) + + assert_series_equal(xs['ItemA'], ref.xs(idx)) + + # not contained + idx = self.panel.major_axis[0] - bday + self.assertRaises(Exception, self.panel.major_xs, idx) + + def test_major_xs_mixed(self): + self.panel['ItemD'] = 'foo' + xs = self.panel.major_xs(self.panel.major_axis[0]) + 
self.assert_(xs['ItemA'].dtype == np.float64) + self.assert_(xs['ItemD'].dtype == np.object_) + + def test_minor_xs(self): + ref = self.panel['ItemA'] + + idx = self.panel.minor_axis[1] + xs = self.panel.minor_xs(idx) + + assert_series_equal(xs['ItemA'], ref[idx]) + + # not contained + self.assertRaises(Exception, self.panel.minor_xs, 'E') + + def test_minor_xs_mixed(self): + self.panel['ItemD'] = 'foo' + + xs = self.panel.minor_xs('D') + self.assert_(xs['ItemA'].dtype == np.float64) + self.assert_(xs['ItemD'].dtype == np.object_) + + def test_xs(self): + itemA = self.panel.xs('ItemA', axis=0) + expected = self.panel['ItemA'] + assert_frame_equal(itemA, expected) + + # not view by default + itemA.values[:] = np.nan + self.assert_(not np.isnan(self.panel['ItemA'].values).all()) + + # but can get view + itemA_view = self.panel.xs('ItemA', axis=0, copy=False) + itemA_view.values[:] = np.nan + self.assert_(np.isnan(self.panel['ItemA'].values).all()) + + # mixed-type + self.panel['strings'] = 'foo' + self.assertRaises(Exception, self.panel.xs, 'D', axis=2, + copy=False) + + def test_getitem_fancy_labels(self): + p = self.panel + + items = p.items[[1, 0]] + dates = p.major_axis[::2] + cols = ['D', 'C', 'F'] + + # all 3 specified + assert_panel_equal(p.ix[items, dates, cols], + p.reindex(items=items, major=dates, minor=cols)) + + # 2 specified + assert_panel_equal(p.ix[:, dates, cols], + p.reindex(major=dates, minor=cols)) + + assert_panel_equal(p.ix[items, :, cols], + p.reindex(items=items, minor=cols)) + + assert_panel_equal(p.ix[items, dates, :], + p.reindex(items=items, major=dates)) + + # only 1 + assert_panel_equal(p.ix[items, :, :], + p.reindex(items=items)) + + assert_panel_equal(p.ix[:, dates, :], + p.reindex(major=dates)) + + assert_panel_equal(p.ix[:, :, cols], + p.reindex(minor=cols)) + + def test_getitem_fancy_slice(self): + pass + + def test_getitem_fancy_ints(self): + pass + + def test_getitem_fancy_xs(self): + p = self.panel + item = 'ItemB' + + date = p.major_axis[5] + col = 'C' + + # get DataFrame + # item + assert_frame_equal(p.ix[item], p[item]) + assert_frame_equal(p.ix[item, :], p[item]) + assert_frame_equal(p.ix[item, :, :], p[item]) + + # major axis, axis=1 + assert_frame_equal(p.ix[:, date], p.major_xs(date)) + assert_frame_equal(p.ix[:, date, :], p.major_xs(date)) + + # minor axis, axis=2 + assert_frame_equal(p.ix[:, :, 'C'], p.minor_xs('C')) + + # get Series + assert_series_equal(p.ix[item, date], p[item].ix[date]) + assert_series_equal(p.ix[item, date, :], p[item].ix[date]) + assert_series_equal(p.ix[item, :, col], p[item][col]) + assert_series_equal(p.ix[:, date, col], p.major_xs(date).ix[col]) + + def test_getitem_fancy_xs_check_view(self): + item = 'ItemB' + date = self.panel.major_axis[5] + col = 'C' + + # make sure it's always a view + NS = slice(None, None) + + # DataFrames + comp = assert_frame_equal + self._check_view(item, comp) + self._check_view((item, NS), comp) + self._check_view((item, NS, NS), comp) + self._check_view((NS, date), comp) + self._check_view((NS, date, NS), comp) + self._check_view((NS, NS, 'C'), comp) + + # Series + comp = assert_series_equal + self._check_view((item, date), comp) + self._check_view((item, date, NS), comp) + self._check_view((item, NS, 'C'), comp) + self._check_view((NS, date, 'C'), comp) + + def test_ix_setitem_slice_dataframe(self): + a = Panel(items=[1,2,3],major_axis=[11,22,33],minor_axis=[111,222,333]) + b = DataFrame(np.random.randn(2,3), index=[111,333], + columns=[1,2,3]) + + a.ix[:, 22, [111, 333]] = b + + 
assert_frame_equal(a.ix[:, 22, [111, 333]], b) + + + def _check_view(self, indexer, comp): + cp = self.panel.copy() + obj = cp.ix[indexer] + obj.values[:] = 0 + self.assert_((obj.values == 0).all()) + comp(cp.ix[indexer].reindex_like(obj), obj) + + def test_get_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + result = self.panel.get_value(item, mjr, mnr) + expected = self.panel[item][mnr][mjr] + assert_almost_equal(result, expected) + + def test_set_value(self): + for item in self.panel.items: + for mjr in self.panel.major_axis[::2]: + for mnr in self.panel.minor_axis: + self.panel.set_value(item, mjr, mnr, 1.) + assert_almost_equal(self.panel[item][mnr][mjr], 1.) + + # resize + res = self.panel.set_value('ItemE', 'foo', 'bar', 1.5) + self.assert_(isinstance(res, Panel)) + self.assert_(res is not self.panel) + self.assertEqual(res.get_value('ItemE', 'foo', 'bar'), 1.5) + + res3 = self.panel.set_value('ItemE', 'foobar', 'baz', 5) + self.assert_(com.is_float_dtype(res3['ItemE'].values)) + +class TestPanel(unittest.TestCase, PanelTests, CheckIndexing, + SafeForLongAndSparse, + SafeForSparse): + + @classmethod + def assert_panel_equal(cls,x, y): + assert_panel_equal(x, y) + + def setUp(self): + self.panel = tm.makePanel() + tm.add_nans(self.panel) + + def test_constructor(self): + # with BlockManager + wp = Panel(self.panel._data) + self.assert_(wp._data is self.panel._data) + + wp = Panel(self.panel._data, copy=True) + self.assert_(wp._data is not self.panel._data) + assert_panel_equal(wp, self.panel) + + # strings handled prop + wp = Panel([[['foo', 'foo', 'foo',], + ['foo', 'foo', 'foo']]]) + self.assert_(wp.values.dtype == np.object_) + + vals = self.panel.values + + # no copy + wp = Panel(vals) + self.assert_(wp.values is vals) + + # copy + wp = Panel(vals, copy=True) + self.assert_(wp.values is not vals) + + def test_constructor_cast(self): + zero_filled = self.panel.fillna(0) + + casted = Panel(zero_filled._data, dtype=int) + casted2 = Panel(zero_filled.values, dtype=int) + + exp_values = zero_filled.values.astype(int) + assert_almost_equal(casted.values, exp_values) + assert_almost_equal(casted2.values, exp_values) + + # can't cast + data = [[['foo', 'bar', 'baz']]] + self.assertRaises(ValueError, Panel, data, dtype=float) + + def test_constructor_empty_panel(self): + empty = Panel() + self.assert_(len(empty.items) == 0) + self.assert_(len(empty.major_axis) == 0) + self.assert_(len(empty.minor_axis) == 0) + + def test_constructor_observe_dtype(self): + # GH #411 + panel = Panel(items=range(3), major_axis=range(3), + minor_axis=range(3), dtype='O') + self.assert_(panel.values.dtype == np.object_) + + def test_consolidate(self): + self.assert_(self.panel._data.is_consolidated()) + + self.panel['foo'] = 1. 
+ self.assert_(not self.panel._data.is_consolidated()) + + panel = self.panel.consolidate() + self.assert_(panel._data.is_consolidated()) + + def test_ctor_dict(self): + itema = self.panel['ItemA'] + itemb = self.panel['ItemB'] + + d = {'A' : itema, 'B' : itemb[5:]} + d2 = {'A' : itema._series, 'B' : itemb[5:]._series} + d3 = {'A' : DataFrame(itema._series), + 'B' : DataFrame(itemb[5:]._series)} + + wp = Panel.from_dict(d) + wp2 = Panel.from_dict(d2) # nested Dict + wp3 = Panel.from_dict(d3) + self.assert_(wp.major_axis.equals(self.panel.major_axis)) + assert_panel_equal(wp, wp2) + + # intersect + wp = Panel.from_dict(d, intersect=True) + self.assert_(wp.major_axis.equals(itemb.index[5:])) + + # use constructor + assert_panel_equal(Panel(d), Panel.from_dict(d)) + assert_panel_equal(Panel(d2), Panel.from_dict(d2)) + assert_panel_equal(Panel(d3), Panel.from_dict(d3)) + + # cast + dcasted = dict((k, v.reindex(wp.major_axis).fillna(0)) + for k, v in d.iteritems()) + result = Panel(dcasted, dtype=int) + expected = Panel(dict((k, v.astype(int)) + for k, v in dcasted.iteritems())) + assert_panel_equal(result, expected) + + def test_constructor_dict_mixed(self): + data = dict((k, v.values) for k, v in self.panel.iterkv()) + result = Panel(data) + exp_major = Index(np.arange(len(self.panel.major_axis))) + self.assert_(result.major_axis.equals(exp_major)) + + result = Panel(data, items=self.panel.items, + major_axis=self.panel.major_axis, + minor_axis=self.panel.minor_axis) + assert_panel_equal(result, self.panel) + + data['ItemC'] = self.panel['ItemC'] + result = Panel(data) + assert_panel_equal(result, self.panel) + + # corner, blow up + data['ItemB'] = data['ItemB'][:-1] + self.assertRaises(Exception, Panel, data) + + data['ItemB'] = self.panel['ItemB'].values[:, :-1] + self.assertRaises(Exception, Panel, data) + + def test_constructor_resize(self): + data = self.panel._data + items = self.panel.items[:-1] + major = self.panel.major_axis[:-1] + minor = self.panel.minor_axis[:-1] + + result = Panel(data, items=items, major_axis=major, + minor_axis=minor) + expected = self.panel.reindex(items=items, major=major, minor=minor) + assert_panel_equal(result, expected) + + result = Panel(data, items=items, major_axis=major) + expected = self.panel.reindex(items=items, major=major) + assert_panel_equal(result, expected) + + result = Panel(data, items=items) + expected = self.panel.reindex(items=items) + assert_panel_equal(result, expected) + + result = Panel(data, minor_axis=minor) + expected = self.panel.reindex(minor=minor) + assert_panel_equal(result, expected) + + def test_from_dict_mixed_orient(self): + df = tm.makeDataFrame() + df['foo'] = 'bar' + + data = {'k1' : df, + 'k2' : df} + + panel = Panel.from_dict(data, orient='minor') + + self.assert_(panel['foo'].values.dtype == np.object_) + self.assert_(panel['A'].values.dtype == np.float64) + + def test_values(self): + self.assertRaises(Exception, Panel, np.random.randn(5, 5, 5), + range(5), range(5), range(4)) + + def test_conform(self): + df = self.panel['ItemA'][:-5].filter(items=['A', 'B']) + conformed = self.panel.conform(df) + + assert(conformed.index.equals(self.panel.major_axis)) + assert(conformed.columns.equals(self.panel.minor_axis)) + + def test_reindex(self): + ref = self.panel['ItemB'] + + # items + result = self.panel.reindex(items=['ItemA', 'ItemB']) + assert_frame_equal(result['ItemB'], ref) + + # major + new_major = list(self.panel.major_axis[:10]) + result = self.panel.reindex(major=new_major) + assert_frame_equal(result['ItemB'], 
ref.reindex(index=new_major)) + + # raise exception put both major and major_axis + self.assertRaises(Exception, self.panel.reindex, + major_axis=new_major, major=new_major) + + # minor + new_minor = list(self.panel.minor_axis[:2]) + result = self.panel.reindex(minor=new_minor) + assert_frame_equal(result['ItemB'], ref.reindex(columns=new_minor)) + + result = self.panel.reindex(items=self.panel.items, + major=self.panel.major_axis, + minor=self.panel.minor_axis) + + assert(result.items is self.panel.items) + assert(result.major_axis is self.panel.major_axis) + assert(result.minor_axis is self.panel.minor_axis) + + self.assertRaises(Exception, self.panel.reindex) + + # with filling + smaller_major = self.panel.major_axis[::5] + smaller = self.panel.reindex(major=smaller_major) + + larger = smaller.reindex(major=self.panel.major_axis, + method='pad') + + assert_frame_equal(larger.major_xs(self.panel.major_axis[1]), + smaller.major_xs(smaller_major[0])) + + # don't necessarily copy + result = self.panel.reindex(major=self.panel.major_axis, copy=False) + self.assert_(result is self.panel) + + def test_reindex_like(self): + # reindex_like + smaller = self.panel.reindex(items=self.panel.items[:-1], + major=self.panel.major_axis[:-1], + minor=self.panel.minor_axis[:-1]) + smaller_like = self.panel.reindex_like(smaller) + assert_panel_equal(smaller, smaller_like) + + def test_take(self): + # axis == 0 + result = self.panel.take([2, 0, 1], axis=0) + expected = self.panel.reindex(items=['ItemC', 'ItemA', 'ItemB']) + assert_panel_equal(result, expected) + + # axis >= 1 + result = self.panel.take([3, 0, 1, 2], axis=2) + expected = self.panel.reindex(minor=['D', 'A', 'B', 'C']) + assert_panel_equal(result, expected) + + self.assertRaises(Exception, self.panel.take, [3, -1, 1, 2], axis=2) + self.assertRaises(Exception, self.panel.take, [4, 0, 1, 2], axis=2) + + def test_sort_index(self): + import random + + ritems = list(self.panel.items) + rmajor = list(self.panel.major_axis) + rminor = list(self.panel.minor_axis) + random.shuffle(ritems) + random.shuffle(rmajor) + random.shuffle(rminor) + + random_order = self.panel.reindex(items=ritems) + sorted_panel = random_order.sort_index(axis=0) + assert_panel_equal(sorted_panel, self.panel) + + # descending + random_order = self.panel.reindex(items=ritems) + sorted_panel = random_order.sort_index(axis=0, ascending=False) + assert_panel_equal(sorted_panel, + self.panel.reindex(items=self.panel.items[::-1])) + + random_order = self.panel.reindex(major=rmajor) + sorted_panel = random_order.sort_index(axis=1) + assert_panel_equal(sorted_panel, self.panel) + + random_order = self.panel.reindex(minor=rminor) + sorted_panel = random_order.sort_index(axis=2) + assert_panel_equal(sorted_panel, self.panel) + + def test_fillna(self): + filled = self.panel.fillna(0) + self.assert_(np.isfinite(filled.values).all()) + + filled = self.panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + self.panel['ItemA'].fillna(method='backfill')) + + panel = self.panel.copy() + panel['str'] = 'foo' + + filled = panel.fillna(method='backfill') + assert_frame_equal(filled['ItemA'], + panel['ItemA'].fillna(method='backfill')) + + empty = self.panel.reindex(items=[]) + filled = empty.fillna(0) + assert_panel_equal(filled, empty) + + def test_swapaxes(self): + result = self.panel.swapaxes('items', 'minor') + self.assert_(result.items is self.panel.minor_axis) + + result = self.panel.swapaxes('items', 'major') + self.assert_(result.items is self.panel.major_axis) + + result = 
self.panel.swapaxes('major', 'minor') + self.assert_(result.major_axis is self.panel.minor_axis) + + panel = self.panel.copy() + result = panel.swapaxes('major', 'minor') + panel.values[0, 0, 1] = np.nan + expected = panel.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + # this should also work + result = self.panel.swapaxes(0, 1) + self.assert_(result.items is self.panel.major_axis) + + # this should not work + self.assertRaises(Exception, self.panel.swapaxes, 'items', 'items') + + def test_transpose(self): + result = self.panel.transpose('minor', 'major', 'items') + expected = self.panel.swapaxes('items', 'minor') + assert_panel_equal(result, expected) + + result = self.panel.transpose(2, 1, 0) + assert_panel_equal(result, expected) + + result = self.panel.transpose('minor', 'items', 'major') + expected = self.panel.swapaxes('items', 'minor') + expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + result = self.panel.transpose(2, 0, 1) + assert_panel_equal(result, expected) + + self.assertRaises(ValueError, self.panel.transpose, 0, 0, 1) + + def test_transpose_copy(self): + panel = self.panel.copy() + result = panel.transpose(2, 0, 1, copy=True) + expected = panel.swapaxes('items', 'minor') + expected = expected.swapaxes('major', 'minor') + assert_panel_equal(result, expected) + + panel.values[0, 1, 1] = np.nan + self.assert_(notnull(result.values[1, 0, 1])) + + def test_to_frame(self): + # filtered + filtered = self.panel.to_frame() + expected = self.panel.to_frame().dropna(how='any') + assert_frame_equal(filtered, expected) + + # unfiltered + unfiltered = self.panel.to_frame(filter_observations=False) + assert_panel_equal(unfiltered.to_panel(), self.panel) + + # names + self.assertEqual(unfiltered.index.names, ['major', 'minor']) + + def test_to_frame_mixed(self): + panel = self.panel.fillna(0) + panel['str'] = 'foo' + panel['bool'] = panel['ItemA'] > 0 + + lp = panel.to_frame() + wp = lp.to_panel() + self.assertEqual(wp['bool'].values.dtype, np.bool_) + assert_frame_equal(wp['bool'], panel['bool']) + + def test_filter(self): + pass + + def test_apply(self): + pass + + def test_compound(self): + compounded = self.panel.compound() + + assert_series_equal(compounded['ItemA'], + (1 + self.panel['ItemA']).product(0) - 1) + + def test_shift(self): + # major + idx = self.panel.major_axis[0] + idx_lag = self.panel.major_axis[1] + + shifted = self.panel.shift(1) + + assert_frame_equal(self.panel.major_xs(idx), + shifted.major_xs(idx_lag)) + + # minor + idx = self.panel.minor_axis[0] + idx_lag = self.panel.minor_axis[1] + + shifted = self.panel.shift(1, axis='minor') + + assert_frame_equal(self.panel.minor_xs(idx), + shifted.minor_xs(idx_lag)) + + self.assertRaises(Exception, self.panel.shift, 1, axis='items') + + def test_multiindex_get(self): + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b',2)], + names=['first', 'second']) + wp = Panel(np.random.random((4,5,5)), + items=ind, + major_axis=np.arange(5), + minor_axis=np.arange(5)) + f1 = wp['a'] + f2 = wp.ix['a'] + assert_panel_equal(f1, f2) + + self.assert_((f1.items == [1, 2]).all()) + self.assert_((f2.items == [1, 2]).all()) + + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) + + def test_multiindex_blocks(self): + ind = MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)], + names=['first', 'second']) + wp = Panel(self.panel._data) + wp.items = ind + f1 = wp['a'] + self.assert_((f1.items == [1, 2]).all()) + + f1 = 
wp[('b',1)] + self.assert_((f1.columns == ['A', 'B', 'C', 'D']).all()) + + def test_repr_empty(self): + empty = Panel() + repr(empty) + + def test_rename(self): + mapper = { + 'ItemA' : 'foo', + 'ItemB' : 'bar', + 'ItemC' : 'baz' + } + + renamed = self.panel.rename_axis(mapper, axis=0) + exp = Index(['foo', 'bar', 'baz']) + self.assert_(renamed.items.equals(exp)) + + renamed = self.panel.rename_axis(str.lower, axis=2) + exp = Index(['a', 'b', 'c', 'd']) + self.assert_(renamed.minor_axis.equals(exp)) + + # don't copy + renamed_nocopy = self.panel.rename_axis(mapper, axis=0, copy=False) + renamed_nocopy['foo'] = 3. + self.assert_((self.panel['ItemA'].values == 3).all()) + + def test_get_attr(self): + assert_frame_equal(self.panel['ItemA'], self.panel.ItemA) + + def test_group_agg(self): + values = np.ones((10, 2)) * np.arange(10).reshape((10, 1)) + bounds = np.arange(5) * 2 + f = lambda x: x.mean(axis=0) + + agged = group_agg(values, bounds, f) + + assert(agged[1][0] == 2.5) + assert(agged[2][0] == 4.5) + + # test a function that doesn't aggregate + f2 = lambda x: np.zeros((2,2)) + self.assertRaises(Exception, group_agg, values, bounds, f2) + + def test_from_frame_level1_unsorted(self): + tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), + ('AAPL', 1), ('MSFT', 1)] + midx = MultiIndex.from_tuples(tuples) + df = DataFrame(np.random.rand(5,4), index=midx) + p = df.to_panel() + assert_frame_equal(p.minor_xs(2), df.ix[:,2].sort_index()) + + def test_to_excel(self): + try: + import os + import xlwt + import xlrd + import openpyxl + from pandas.io.parsers import ExcelFile + except ImportError: + raise nose.SkipTest + + for ext in ['xls', 'xlsx']: + path = '__tmp__.' + ext + self.panel.to_excel(path) + reader = ExcelFile(path) + for item, df in self.panel.iteritems(): + recdf = reader.parse(str(item),index_col=0) + assert_frame_equal(df, recdf) + os.remove(path) + +class TestLongPanel(unittest.TestCase): + """ + LongPanel no longer exists, but... 
+ """ + + def setUp(self): + panel = tm.makePanel() + tm.add_nans(panel) + + self.panel = panel.to_frame() + self.unfiltered_panel = panel.to_frame(filter_observations=False) + + def test_ops_differently_indexed(self): + # trying to set non-identically indexed panel + wp = self.panel.to_panel() + wp2 = wp.reindex(major=wp.major_axis[:-1]) + lp2 = wp2.to_frame() + + result = self.panel + lp2 + assert_frame_equal(result.reindex(lp2.index), lp2 * 2) + + # careful, mutation + self.panel['foo'] = lp2['ItemA'] + assert_series_equal(self.panel['foo'].reindex(lp2.index), + lp2['ItemA']) + + def test_ops_scalar(self): + result = self.panel.mul(2) + expected = DataFrame.__mul__(self.panel, 2) + assert_frame_equal(result, expected) + + def test_combineFrame(self): + wp = self.panel.to_panel() + result = self.panel.add(wp['ItemA'].stack(), axis=0) + assert_frame_equal(result.to_panel()['ItemA'], wp['ItemA'] * 2) + + def test_combinePanel(self): + wp = self.panel.to_panel() + result = self.panel.add(self.panel) + wide_result = result.to_panel() + assert_frame_equal(wp['ItemA'] * 2, wide_result['ItemA']) + + # one item + result = self.panel.add(self.panel.filter(['ItemA'])) + + def test_combine_scalar(self): + result = self.panel.mul(2) + expected = DataFrame(self.panel._data) * 2 + assert_frame_equal(result, expected) + + def test_combine_series(self): + s = self.panel['ItemA'][:10] + result = self.panel.add(s, axis=0) + expected = DataFrame.add(self.panel, s, axis=0) + assert_frame_equal(result, expected) + + s = self.panel.ix[5] + result = self.panel + s + expected = DataFrame.add(self.panel, s, axis=1) + assert_frame_equal(result, expected) + + def test_operators(self): + wp = self.panel.to_panel() + result = (self.panel + 1).to_panel() + assert_frame_equal(wp['ItemA'] + 1, result['ItemA']) + + def test_sort(self): + def is_sorted(arr): + return (arr[1:] > arr[:-1]).any() + + sorted_minor = self.panel.sortlevel(level=1) + self.assert_(is_sorted(sorted_minor.index.labels[1])) + + sorted_major = sorted_minor.sortlevel(level=0) + self.assert_(is_sorted(sorted_major.index.labels[0])) + + def test_to_string(self): + from pandas.util.py3compat import StringIO + + buf = StringIO() + self.panel.to_string(buf) + + def test_truncate(self): + dates = self.panel.index.levels[0] + start, end = dates[1], dates[5] + + trunced = self.panel.truncate(start, end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(start, end) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(before=start).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(before=start) + + assert_frame_equal(trunced['ItemA'], expected) + + trunced = self.panel.truncate(after=end).to_panel() + expected = self.panel.to_panel()['ItemA'].truncate(after=end) + + assert_frame_equal(trunced['ItemA'], expected) + + # truncate on dates that aren't in there + wp = self.panel.to_panel() + new_index = wp.major_axis[::5] + + wp2 = wp.reindex(major=new_index) + + lp2 = wp2.to_frame() + lp_trunc = lp2.truncate(wp.major_axis[2], wp.major_axis[-2]) + + wp_trunc = wp2.truncate(wp.major_axis[2], wp.major_axis[-2]) + + assert_panel_equal(wp_trunc, lp_trunc.to_panel()) + + # throw proper exception + self.assertRaises(Exception, lp2.truncate, wp.major_axis[-2], + wp.major_axis[2]) + + def test_axis_dummies(self): + from pandas.core.reshape import make_axis_dummies + + minor_dummies = make_axis_dummies(self.panel, 'minor') + self.assertEqual(len(minor_dummies.columns), + len(self.panel.index.levels[1])) + + 
        major_dummies = make_axis_dummies(self.panel, 'major')
+        self.assertEqual(len(major_dummies.columns),
+                         len(self.panel.index.levels[0]))
+
+        mapping = {'A' : 'one',
+                   'B' : 'one',
+                   'C' : 'two',
+                   'D' : 'two'}
+
+        transformed = make_axis_dummies(self.panel, 'minor',
+                                        transform=mapping.get)
+        self.assertEqual(len(transformed.columns), 2)
+        self.assert_(np.array_equal(transformed.columns, ['one', 'two']))
+
+        # TODO: test correctness
+
+    def test_get_dummies(self):
+        from pandas.core.reshape import get_dummies, make_axis_dummies
+
+        self.panel['Label'] = self.panel.index.labels[1]
+        minor_dummies = make_axis_dummies(self.panel, 'minor')
+        dummies = get_dummies(self.panel['Label'])
+        self.assert_(np.array_equal(dummies.values, minor_dummies.values))
+
+    def test_apply(self):
+        # ufunc
+        applied = self.panel.apply(np.sqrt)
+        self.assert_(assert_almost_equal(applied.values,
+                                         np.sqrt(self.panel.values)))
+
+    def test_mean(self):
+        means = self.panel.mean(level='minor')
+
+        # test versus Panel version
+        wide_means = self.panel.to_panel().mean('major')
+        assert_frame_equal(means, wide_means)
+
+    def test_sum(self):
+        sums = self.panel.sum(level='minor')
+
+        # test versus Panel version
+        wide_sums = self.panel.to_panel().sum('major')
+        assert_frame_equal(sums, wide_sums)
+
+    def test_count(self):
+        index = self.panel.index
+
+        major_count = self.panel.count(level=0)['ItemA']
+        labels = index.labels[0]
+        for i, idx in enumerate(index.levels[0]):
+            self.assertEqual(major_count[i], (labels == i).sum())
+
+        minor_count = self.panel.count(level=1)['ItemA']
+        labels = index.labels[1]
+        for i, idx in enumerate(index.levels[1]):
+            self.assertEqual(minor_count[i], (labels == i).sum())
+
+    def test_join(self):
+        lp1 = self.panel.filter(['ItemA', 'ItemB'])
+        lp2 = self.panel.filter(['ItemC'])
+
+        joined = lp1.join(lp2)
+
+        self.assertEqual(len(joined.columns), 3)
+
+        self.assertRaises(Exception, lp1.join,
+                          self.panel.filter(['ItemB', 'ItemC']))
+
+    def test_pivot(self):
+        from pandas.core.reshape import _slow_pivot
+
+        one, two, three = (np.array([1, 2, 3, 4, 5]),
+                           np.array(['a', 'b', 'c', 'd', 'e']),
+                           np.array([1, 2, 3, 5, 4.]))
+        df = pivot(one, two, three)
+        self.assertEqual(df['a'][1], 1)
+        self.assertEqual(df['b'][2], 2)
+        self.assertEqual(df['c'][3], 3)
+        self.assertEqual(df['d'][4], 5)
+        self.assertEqual(df['e'][5], 4)
+        assert_frame_equal(df, _slow_pivot(one, two, three))
+
+        # weird overlap, TODO: test?
+        a, b, c = (np.array([1, 2, 3, 4, 4]),
+                   np.array(['a', 'a', 'a', 'a', 'a']),
+                   np.array([1., 2., 3., 4., 5.]))
+        self.assertRaises(Exception, pivot, a, b, c)
+
+        # corner case, empty
+        df = pivot(np.array([]), np.array([]), np.array([]))
+
+def test_monotonic():
+    pos = np.array([1, 2, 3, 5])
+
+    assert panelm._monotonic(pos)
+
+    neg = np.array([1, 2, 3, 4, 3])
+
+    assert not panelm._monotonic(neg)
+
+    neg2 = np.array([5, 1, 2, 3, 4, 5])
+
+    assert not panelm._monotonic(neg2)
+
+def test_panel_index():
+    index = panelm.panel_index([1,2,3,4], [1,2,3])
+    expected = MultiIndex.from_arrays([np.tile([1,2,3,4], 3),
+                                       np.repeat([1,2,3], 4)])
+    assert(index.equals(expected))
+
+if __name__ == '__main__':
+    import nose
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tests/test_reshape.py b/pandas/tests/test_reshape.py
new file mode 100644
index 00000000..3cb600ff
--- /dev/null
+++ b/pandas/tests/test_reshape.py
@@ -0,0 +1,128 @@
+# pylint: disable-msg=W0612,E1101
+from copy import deepcopy
+from datetime import datetime, timedelta
+from StringIO import StringIO
+import cPickle as pickle
+import operator
+import os
+import unittest
+
+import nose
+
+from pandas import DataFrame
+
+from numpy import nan
+import numpy as np
+
+from pandas.core.reshape import melt, convert_dummies, lreshape
+import pandas.util.testing as tm
+
+def test_melt():
+    df = tm.makeTimeDataFrame()[:10]
+    df['id1'] = (df['A'] > 0).astype(int)
+    df['id2'] = (df['B'] > 0).astype(int)
+
+    molten1 = melt(df)
+    molten2 = melt(df, id_vars=['id1'])
+    molten3 = melt(df, id_vars=['id1', 'id2'])
+
+def test_convert_dummies():
+    df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
+                           'foo', 'bar', 'foo', 'foo'],
+                    'B' : ['one', 'one', 'two', 'three',
+                           'two', 'two', 'one', 'three'],
+                    'C' : np.random.randn(8),
+                    'D' : np.random.randn(8)})
+
+    result = convert_dummies(df, ['A', 'B'])
+    result2 = convert_dummies(df, ['A', 'B'], prefix_sep='.')
+
+    expected = DataFrame({'A_foo' : [1, 0, 1, 0, 1, 0, 1, 1],
+                          'A_bar' : [0, 1, 0, 1, 0, 1, 0, 0],
+                          'B_one' : [1, 1, 0, 0, 0, 0, 1, 0],
+                          'B_two' : [0, 0, 1, 0, 1, 1, 0, 0],
+                          'B_three' : [0, 0, 0, 1, 0, 0, 0, 1],
+                          'C' : df['C'].values,
+                          'D' : df['D'].values},
+                         columns=result.columns, dtype=float)
+    expected2 = expected.rename(columns=lambda x: x.replace('_', '.'))
+
+    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result2, expected2)
+
+class Test_lreshape(unittest.TestCase):
+
+    def test_pairs(self):
+        data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
+                            '21dec2008', '11jan2009'],
+                'birthwt': [1766, 3301, 1454, 3139, 4133],
+                'id': [101, 102, 103, 104, 105],
+                'sex': ['Male', 'Female', 'Female', 'Female', 'Female'],
+                'visitdt1': ['11jan2009', '22dec2008', '04jan2009',
+                             '29dec2008', '20jan2009'],
+                'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'],
+                'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'],
+                'wt1': [1823, 3338, 1549, 3298, 4306],
+                'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0],
+                'wt3': [2293.0, nan, nan, 3377.0, 4805.0]}
+
+        df = DataFrame(data)
+
+        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)],
+                'wt': ['wt%d' % i for i in range(1, 4)]}
+        result = lreshape(df, spec)
+
+        exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
+                                '21dec2008', '11jan2009', '08jan2009',
+                                '30dec2008', '21dec2008', '11jan2009',
+                                '08jan2009', '21dec2008', '11jan2009'],
+                    'birthwt': [1766, 3301, 1454, 3139, 4133, 1766,
+                                1454, 3139, 4133, 1766, 3139, 4133],
+                    'id': [101, 102, 103, 104, 105, 101,
+                           103, 104, 105, 101, 104, 105],
+                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
+                            'Male', 'Female', 'Female', 'Female', 'Male',
+                            'Female', 'Female'],
+                    'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008',
+                                '20jan2009', '21jan2009', '22jan2009', '31dec2008',
+                                '03feb2009', '05feb2009', '02jan2009', '15feb2009'],
+                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
+                           1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]}
+        exp = DataFrame(exp_data, columns=result.columns)
+        tm.assert_frame_equal(result, exp)
+
+        result = lreshape(df, spec, dropna=False)
+        exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008',
+                                '21dec2008', '11jan2009',
+                                '08jan2009', '20dec2008', '30dec2008',
+                                '21dec2008', '11jan2009',
+                                '08jan2009', '20dec2008', '30dec2008',
+                                '21dec2008', '11jan2009'],
+                    'birthwt': [1766, 3301, 1454, 3139, 4133,
+                                1766, 3301, 1454, 3139, 4133,
+                                1766, 3301, 1454, 3139, 4133],
+                    'id': [101, 102, 103, 104, 105,
+                           101, 102, 103, 104, 105,
+                           101, 102, 103, 104, 105],
+                    'sex': ['Male', 'Female', 'Female', 'Female', 'Female',
+                            'Male', 'Female', 'Female', 'Female', 'Female',
+                            'Male', 'Female', 'Female', 'Female', 'Female'],
+                    'visitdt': ['11jan2009', '22dec2008', '04jan2009',
+                                '29dec2008', '20jan2009',
+                                '21jan2009', nan, '22jan2009',
+                                '31dec2008', '03feb2009',
+                                '05feb2009', nan, nan, '02jan2009', '15feb2009'],
+                    'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0,
+                           nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan,
+                           3377.0, 4805.0]}
+        exp = DataFrame(exp_data, columns=result.columns)
+        tm.assert_frame_equal(result, exp)
+
+        spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)],
+                'wt': ['wt%d' % i for i in range(1, 4)]}
+        self.assertRaises(ValueError, lreshape, df, spec)
+
+
+if __name__ == '__main__':
+    nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
new file mode 100644
index 00000000..cf2f2a75
--- /dev/null
+++ b/pandas/tests/test_series.py
@@ -0,0 +1,2978 @@
+# pylint: disable-msg=E1101,W0612
+
+from datetime import datetime, timedelta, date
+import os
+import operator
+import unittest
+
+import nose
+
+from numpy import nan
+import numpy as np
+import numpy.ma as ma
+
+from pandas import (Index, Series, TimeSeries, DataFrame, isnull, notnull,
+                    bdate_range, date_range)
+from pandas.core.index import MultiIndex
+from pandas.tseries.index import Timestamp, DatetimeIndex
+import pandas.core.series as smod
+import pandas.lib as lib
+
+import pandas.core.datetools as datetools
+import pandas.core.nanops as nanops
+
+from pandas.util.py3compat import StringIO
+from pandas.util import py3compat
+from pandas.util.testing import assert_series_equal, assert_almost_equal
+import pandas.util.testing as tm
+
+def _skip_if_no_scipy():
+    try:
+        import scipy.stats
+    except ImportError:
+        raise nose.SkipTest
+
+
+#-------------------------------------------------------------------------------
+# Series test cases
+
+JOIN_TYPES = ['inner', 'outer', 'left', 'right']
+
+class CheckNameIntegration(object):
+
+    def test_scalarop_preserve_name(self):
+        result = self.ts * 2
+        self.assertEquals(result.name, self.ts.name)
+
+    def test_copy_name(self):
+        result = self.ts.copy()
+        self.assertEquals(result.name, self.ts.name)
+
+    # def test_copy_index_name_checking(self):
+    #     # don't want to be able to modify the index stored elsewhere after
+    #     # making a copy
+
+    #     self.ts.index.name = None
+    #     cp = self.ts.copy()
+    #     cp.index.name = 'foo'
+    #
self.assert_(self.ts.index.name is None) + + def test_append_preserve_name(self): + result = self.ts[:5].append(self.ts[5:]) + self.assertEquals(result.name, self.ts.name) + + def test_binop_maybe_preserve_name(self): + # names match, preserve + result = self.ts * self.ts + self.assertEquals(result.name, self.ts.name) + + result = self.ts * self.ts[:-2] + self.assertEquals(result.name, self.ts.name) + + # names don't match, don't preserve + cp = self.ts.copy() + cp.name = 'something else' + result = self.ts + cp + self.assert_(result.name is None) + + def test_combine_first_name(self): + result = self.ts.combine_first(self.ts[:5]) + self.assertEquals(result.name, self.ts.name) + + def test_getitem_preserve_name(self): + result = self.ts[self.ts > 0] + self.assertEquals(result.name, self.ts.name) + + result = self.ts[[0, 2, 4]] + self.assertEquals(result.name, self.ts.name) + + result = self.ts[5:10] + self.assertEquals(result.name, self.ts.name) + + def test_multilevel_name_print(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + s = Series(range(0,len(index)), index=index, name='sth') + expected = ["first second", + "foo one 0", + " two 1", + " three 2", + "bar one 3", + " two 4", + "baz two 5", + " three 6", + "qux one 7", + " two 8", + " three 9", + "Name: sth"] + expected = "\n".join(expected) + self.assertEquals(repr(s), expected) + + def test_multilevel_preserve_name(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + s = Series(np.random.randn(len(index)), index=index, name='sth') + + result = s['foo'] + result2 = s.ix['foo'] + self.assertEquals(result.name, s.name) + self.assertEquals(result2.name, s.name) + + def test_name_printing(self): + # test small series + s = Series([0, 1, 2]) + s.name = "test" + self.assert_("Name: test" in repr(s)) + s.name = None + self.assert_(not "Name:" in repr(s)) + # test big series (diff code path) + s = Series(range(0,1000)) + s.name = "test" + self.assert_("Name: test" in repr(s)) + s.name = None + self.assert_(not "Name:" in repr(s)) + + def test_pickle_preserve_name(self): + unpickled = self._pickle_roundtrip(self.ts) + self.assertEquals(unpickled.name, self.ts.name) + + def _pickle_roundtrip(self, obj): + obj.save('__tmp__') + unpickled = Series.load('__tmp__') + os.remove('__tmp__') + return unpickled + + def test_argsort_preserve_name(self): + result = self.ts.argsort() + self.assertEquals(result.name, self.ts.name) + + def test_sort_index_name(self): + result = self.ts.sort_index(ascending=False) + self.assertEquals(result.name, self.ts.name) + + def test_to_sparse_pass_name(self): + result = self.ts.to_sparse() + self.assertEquals(result.name, self.ts.name) + +class TestNanops(unittest.TestCase): + + def test_comparisons(self): + left = np.random.randn(10) + right = np.random.randn(10) + left[:3] = np.nan + + result = nanops.nangt(left, right) + expected = (left > right).astype('O') + expected[:3] = np.nan + + assert_almost_equal(result, expected) + + def test_none_comparison(self): + # bug brought up by #1079 + s = Series(np.random.randn(10), index=range(0, 20, 2)) + self.assertRaises(TypeError, s.__eq__, None) + + def test_sum_zero(self): + arr = np.array([]) + self.assert_(nanops.nansum(arr) == 0) + + arr = np.empty((10, 0)) + 
self.assert_((nanops.nansum(arr, axis=1) == 0).all()) + + # GH #844 + s = Series([], index=[]) + self.assert_(s.sum() == 0) + + df = DataFrame(np.empty((10, 0))) + self.assert_((df.sum(1) == 0).all()) + + +class SafeForSparse(object): + pass + +class TestSeries(unittest.TestCase, CheckNameIntegration): + + def setUp(self): + self.ts = tm.makeTimeSeries() + self.ts.name = 'ts' + + self.series = tm.makeStringSeries() + self.series.name = 'series' + + self.objSeries = tm.makeObjectSeries() + self.objSeries.name = 'objects' + + self.empty = Series([], index=[]) + + def test_constructor(self): + # Recognize TimeSeries + self.assert_(isinstance(self.ts, TimeSeries)) + + # Pass in Series + derived = Series(self.ts) + self.assert_(isinstance(derived, TimeSeries)) + + self.assert_(tm.equalContents(derived.index, self.ts.index)) + # Ensure new index is not created + self.assertEquals(id(self.ts.index), id(derived.index)) + + # Pass in scalar + scalar = Series(0.5) + self.assert_(isinstance(scalar, float)) + + # Mixed type Series + mixed = Series(['hello', np.NaN], index=[0, 1]) + self.assert_(mixed.dtype == np.object_) + self.assert_(mixed[1] is np.NaN) + + self.assert_(not isinstance(self.empty, TimeSeries)) + self.assert_(not isinstance(Series({}), TimeSeries)) + + self.assertRaises(Exception, Series, np.random.randn(3, 3), + index=np.arange(3)) + + def test_constructor_empty(self): + empty = Series() + empty2 = Series([]) + assert_series_equal(empty, empty2) + + empty = Series(index=range(10)) + empty2 = Series(np.nan, index=range(10)) + assert_series_equal(empty, empty2) + + def test_constructor_maskedarray(self): + data = ma.masked_all((3,), dtype=float) + result = Series(data) + expected = Series([nan, nan, nan]) + assert_series_equal(result, expected) + + data[0] = 0.0 + data[2] = 2.0 + index = ['a', 'b', 'c'] + result = Series(data, index=index) + expected = Series([0.0, nan, 2.0], index=index) + assert_series_equal(result, expected) + + def test_constructor_default_index(self): + s = Series([0, 1, 2]) + assert_almost_equal(s.index, np.arange(3)) + + def test_constructor_corner(self): + df = tm.makeTimeDataFrame() + objs = [df, df] + s = Series(objs, index=[0, 1]) + self.assert_(isinstance(s, Series)) + + def test_constructor_sanitize(self): + s = Series(np.array([1., 1., 8.]), dtype='i8') + self.assertEquals(s.dtype, np.dtype('i8')) + + s = Series(np.array([1., 1., np.nan]), copy=True, dtype='i8') + self.assertEquals(s.dtype, np.dtype('f8')) + + def test_constructor_pass_none(self): + s = Series(None, index=range(5)) + self.assert_(s.dtype == np.float64) + + s = Series(None, index=range(5), dtype=object) + self.assert_(s.dtype == np.object_) + + def test_constructor_cast(self): + self.assertRaises(ValueError, Series, ['a', 'b', 'c'], dtype=float) + + def test_constructor_dict(self): + d = {'a' : 0., 'b' : 1., 'c' : 2.} + result = Series(d, index=['b', 'c', 'd', 'a']) + expected = Series([1, 2, nan, 0], index=['b', 'c', 'd', 'a']) + assert_series_equal(result, expected) + + pidx = tm.makePeriodIndex(100) + d = {pidx[0] : 0, pidx[1] : 1} + result = Series(d, index=pidx) + expected = Series(np.nan, pidx) + expected.ix[0] = 0 + expected.ix[1] = 1 + assert_series_equal(result, expected) + + def test_constructor_subclass_dict(self): + data = tm.TestSubDict((x, 10.0 * x) for x in xrange(10)) + series = Series(data) + refseries = Series(dict(data.iteritems())) + assert_series_equal(refseries, series) + + def test_constructor_list_of_tuples(self): + data = [(1, 1), (2, 2), (2, 3)] + s = Series(data) 
+ self.assertEqual(list(s), data) + + def test_constructor_tuple_of_tuples(self): + data = ((1, 1), (2, 2), (2, 3)) + s = Series(data) + self.assertEqual(tuple(s), data) + + def test_fromDict(self): + data = {'a' : 0, 'b' : 1, 'c' : 2, 'd' : 3} + + series = Series(data) + self.assert_(tm.is_sorted(series.index)) + + data = {'a' : 0, 'b' : '1', 'c' : '2', 'd' : datetime.now()} + series = Series(data) + self.assert_(series.dtype == np.object_) + + data = {'a' : 0, 'b' : '1', 'c' : '2', 'd' : '3'} + series = Series(data) + self.assert_(series.dtype == np.object_) + + data = {'a' : '0', 'b' : '1'} + series = Series(data, dtype=float) + self.assert_(series.dtype == np.float64) + + def test_from_json_to_json(self): + raise nose.SkipTest + + def _check_orient(series, orient, dtype=None, numpy=True): + series = series.sort_index() + unser = Series.from_json(series.to_json(orient=orient), + orient=orient, numpy=numpy, dtype=dtype) + unser = unser.sort_index() + if series.index.dtype.type == np.datetime64: + unser.index = DatetimeIndex(unser.index.values.astype('i8')) + if orient == "records" or orient == "values": + assert_almost_equal(series.values, unser.values) + else: + try: + assert_series_equal(series, unser) + except: + raise + if orient == "split": + self.assert_(series.name == unser.name) + + def _check_all_orients(series, dtype=None): + _check_orient(series, "columns", dtype=dtype) + _check_orient(series, "records", dtype=dtype) + _check_orient(series, "split", dtype=dtype) + _check_orient(series, "index", dtype=dtype) + _check_orient(series, "values", dtype=dtype) + + _check_orient(series, "columns", dtype=dtype, numpy=False) + _check_orient(series, "records", dtype=dtype, numpy=False) + _check_orient(series, "split", dtype=dtype, numpy=False) + _check_orient(series, "index", dtype=dtype, numpy=False) + _check_orient(series, "values", dtype=dtype, numpy=False) + + # basic + _check_all_orients(self.series) + self.assertEqual(self.series.to_json(), + self.series.to_json(orient="index")) + + objSeries = Series([str(d) for d in self.objSeries], + index=self.objSeries.index, + name=self.objSeries.name) + _check_all_orients(objSeries) + _check_all_orients(self.empty) + _check_all_orients(self.ts) + + # dtype + s = Series(range(6), index=['a','b','c','d','e','f']) + _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64) + _check_all_orients(Series(s, dtype=np.int), dtype=np.int) + + def test_to_json_except(self): + raise nose.SkipTest + s = Series([1, 2, 3]) + self.assertRaises(ValueError, s.to_json, orient="garbage") + + def test_setindex(self): + # wrong type + series = self.series.copy() + self.assertRaises(TypeError, setattr, series, 'index', None) + + # wrong length + series = self.series.copy() + self.assertRaises(AssertionError, setattr, series, 'index', + np.arange(len(series) - 1)) + + # works + series = self.series.copy() + series.index = np.arange(len(series)) + self.assert_(isinstance(series.index, Index)) + + def test_array_finalize(self): + pass + + def test_fromValue(self): + nans = Series(np.NaN, index=self.ts.index) + self.assert_(nans.dtype == np.float_) + self.assertEqual(len(nans), len(self.ts)) + + strings = Series('foo', index=self.ts.index) + self.assert_(strings.dtype == np.object_) + self.assertEqual(len(strings), len(self.ts)) + + d = datetime.now() + dates = Series(d, index=self.ts.index) + self.assert_(dates.dtype == np.object_) + self.assertEqual(len(dates), len(self.ts)) + + def test_contains(self): + tm.assert_contains_all(self.ts.index, self.ts) + + 
def test_pickle(self): + unp_series = self._pickle_roundtrip(self.series) + unp_ts = self._pickle_roundtrip(self.ts) + assert_series_equal(unp_series, self.series) + assert_series_equal(unp_ts, self.ts) + + def _pickle_roundtrip(self, obj): + obj.save('__tmp__') + unpickled = Series.load('__tmp__') + os.remove('__tmp__') + return unpickled + + def test_getitem_get(self): + idx1 = self.series.index[5] + idx2 = self.objSeries.index[5] + + self.assertEqual(self.series[idx1], self.series.get(idx1)) + self.assertEqual(self.objSeries[idx2], self.objSeries.get(idx2)) + + self.assertEqual(self.series[idx1], self.series[5]) + self.assertEqual(self.objSeries[idx2], self.objSeries[5]) + + self.assert_(self.series.get(-1) is None) + self.assertEqual(self.series[5], self.series.get(self.series.index[5])) + + # missing + d = self.ts.index[0] - datetools.bday + self.assertRaises(KeyError, self.ts.__getitem__, d) + + def test_iget(self): + s = Series(np.random.randn(10), index=range(0, 20, 2)) + for i in range(len(s)): + result = s.iget(i) + exp = s[s.index[i]] + assert_almost_equal(result, exp) + + # pass a slice + result = s.iget(slice(1, 3)) + expected = s.ix[2:4] + assert_series_equal(result, expected) + + # test slice is a view + result[:] = 0 + self.assert_((s[1:3] == 0).all()) + + # list of integers + result = s.iget([0, 2, 3, 4, 5]) + expected = s.reindex(s.index[[0, 2, 3, 4, 5]]) + assert_series_equal(result, expected) + + def test_getitem_regression(self): + s = Series(range(5), index=range(5)) + result = s[range(5)] + assert_series_equal(result, s) + + def test_getitem_setitem_slice_bug(self): + s = Series(range(10), range(10)) + result = s[-12:] + assert_series_equal(result, s) + + result = s[-7:] + assert_series_equal(result, s[3:]) + + result = s[:-12] + assert_series_equal(result, s[:0]) + + s = Series(range(10), range(10)) + s[-12:] = 0 + self.assert_((s == 0).all()) + + s[:-12] = 5 + self.assert_((s == 0).all()) + + def test_getitem_int64(self): + idx = np.int64(5) + self.assertEqual(self.ts[idx], self.ts[5]) + + def test_getitem_fancy(self): + slice1 = self.series[[1,2,3]] + slice2 = self.objSeries[[1,2,3]] + self.assertEqual(self.series.index[2], slice1.index[1]) + self.assertEqual(self.objSeries.index[2], slice2.index[1]) + self.assertEqual(self.series[2], slice1[1]) + self.assertEqual(self.objSeries[2], slice2[1]) + + def test_getitem_boolean(self): + s = self.series + mask = s > s.median() + + # passing list is OK + result = s[list(mask)] + expected = s[mask] + assert_series_equal(result, expected) + self.assert_(np.array_equal(result.index, s.index[mask])) + + def test_getitem_generator(self): + gen = (x > 0 for x in self.series) + result = self.series[gen] + result2 = self.series[iter(self.series > 0)] + expected = self.series[self.series > 0] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + def test_getitem_boolean_object(self): + # using column from DataFrame + s = self.series + mask = s > s.median() + omask = mask.astype(object) + + # getitem + result = s[omask] + expected = s[mask] + assert_series_equal(result, expected) + + # setitem + cop = s.copy() + cop[omask] = 5 + s[mask] = 5 + assert_series_equal(cop, s) + + # nans raise exception + omask[5:10] = np.nan + self.assertRaises(Exception, s.__getitem__, omask) + self.assertRaises(Exception, s.__setitem__, omask, 5) + + def test_getitem_setitem_boolean_corner(self): + ts = self.ts + mask_shifted = ts.shift(1, freq=datetools.bday) > ts.median() + self.assertRaises(Exception, ts.__getitem__, 
mask_shifted) + self.assertRaises(Exception, ts.__setitem__, mask_shifted, 1) + + self.assertRaises(Exception, ts.ix.__getitem__, mask_shifted) + self.assertRaises(Exception, ts.ix.__setitem__, mask_shifted, 1) + + def test_getitem_setitem_slice_integers(self): + s = Series(np.random.randn(8), index=[2, 4, 6, 8, 10, 12, 14, 16]) + + result = s[:4] + expected = s.reindex([2, 4, 6, 8]) + assert_series_equal(result, expected) + + s[:4] = 0 + self.assert_((s[:4] == 0).all()) + self.assert_(not (s[4:] == 0).any()) + + def test_getitem_out_of_bounds(self): + # don't segfault, GH #495 + self.assertRaises(IndexError, self.ts.__getitem__, len(self.ts)) + + # GH #917 + s = Series([]) + self.assertRaises(IndexError, s.__getitem__, -1) + + def test_getitem_setitem_integers(self): + # caused bug without test + s = Series([1,2,3], ['a','b','c']) + + self.assertEqual(s.ix[0], s['a']) + s.ix[0] = 5 + self.assertAlmostEqual(s['a'], 5) + + def test_getitem_box_float64(self): + value = self.ts[5] + self.assert_(isinstance(value, np.float64)) + + def test_getitem_ambiguous_keyerror(self): + s = Series(range(10), index=range(0, 20, 2)) + self.assertRaises(KeyError, s.__getitem__, 1) + self.assertRaises(KeyError, s.ix.__getitem__, 1) + + def test_setitem_ambiguous_keyerror(self): + s = Series(range(10), index=range(0, 20, 2)) + self.assertRaises(KeyError, s.__setitem__, 1, 5) + self.assertRaises(KeyError, s.ix.__setitem__, 1, 5) + + def test_setitem_float_labels(self): + # note labels are floats + s = Series(['a','b','c'],index=[0,0.5,1]) + tmp = s.copy() + + s.ix[1] = 'zoo' + tmp.values[1] = 'zoo' + + assert_series_equal(s, tmp) + + def test_slice(self): + numSlice = self.series[10:20] + numSliceEnd = self.series[-10:] + objSlice = self.objSeries[10:20] + + self.assert_(self.series.index[9] not in numSlice.index) + self.assert_(self.objSeries.index[9] not in objSlice.index) + + self.assertEqual(len(numSlice), len(numSlice.index)) + self.assertEqual(self.series[numSlice.index[0]], + numSlice[numSlice.index[0]]) + + self.assertEqual(numSlice.index[1], self.series.index[11]) + + self.assert_(tm.equalContents(numSliceEnd, + np.array(self.series)[-10:])) + + # test return view + sl = self.series[10:20] + sl[:] = 0 + self.assert_((self.series[10:20] == 0).all()) + + def test_slice_can_reorder_not_uniquely_indexed(self): + s = Series(1, index=['a', 'a', 'b', 'b', 'c']) + result = s[::-1] # it works! 
+ + def test_slice_float_get_set(self): + result = self.ts[4.0:10.0] + expected = self.ts[4:10] + assert_series_equal(result, expected) + + self.ts[4.0:10.0] = 0 + self.assert_((self.ts[4:10] == 0).all()) + + self.assertRaises(TypeError, self.ts.__getitem__, slice(4.5, 10.0)) + self.assertRaises(TypeError, self.ts.__setitem__, slice(4.5, 10.0), 0) + + def test_slice_floats2(self): + s = Series(np.random.rand(10), index=np.arange(10,20,dtype=float)) + + self.assert_(len(s.ix[12.0:]) == 8) + self.assert_(len(s.ix[12.5:]) == 7) + + i = np.arange(10,20,dtype=float) + i[2] = 12.2 + s.index = i + self.assert_(len(s.ix[12.0:]) == 8) + self.assert_(len(s.ix[12.5:]) == 7) + + def test_slice_float64(self): + values = np.arange(10., 50., 2) + index = Index(values) + + start, end = values[[5, 15]] + + s = Series(np.random.randn(20), index=index) + + result = s[start:end] + expected = s.ix[5:16] + assert_series_equal(result, expected) + + result = s.ix[start:end] + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(20, 3), index=index) + + result = df[start:end] + expected = df.ix[5:16] + tm.assert_frame_equal(result, expected) + + result = df.ix[start:end] + tm.assert_frame_equal(result, expected) + + def test_setitem(self): + self.ts[self.ts.index[5]] = np.NaN + self.ts[[1,2,17]] = np.NaN + self.ts[6] = np.NaN + self.assert_(np.isnan(self.ts[6])) + self.assert_(np.isnan(self.ts[2])) + self.ts[np.isnan(self.ts)] = 5 + self.assert_(not np.isnan(self.ts[2])) + + # caught this bug when writing tests + series = Series(tm.makeIntIndex(20).astype(float), + index=tm.makeIntIndex(20)) + + series[::2] = 0 + self.assert_((series[::2] == 0).all()) + + # set item that's not contained + self.assertRaises(Exception, self.series.__setitem__, + 'foobar', 1) + + def test_set_value(self): + idx = self.ts.index[10] + res = self.ts.set_value(idx, 0) + self.assert_(res is self.ts) + self.assertEqual(self.ts[idx], 0) + + res = self.series.set_value('foobar', 0) + self.assert_(res is not self.series) + self.assert_(res.index[-1] == 'foobar') + self.assertEqual(res['foobar'], 0) + + def test_setslice(self): + sl = self.ts[5:20] + self.assertEqual(len(sl), len(sl.index)) + self.assert_(sl.index.is_unique) + + def test_basic_getitem_setitem_corner(self): + # invalid tuples, e.g. self.ts[:, None] vs. self.ts[:, 2] + self.assertRaises(Exception, self.ts.__getitem__, + (slice(None, None), 2)) + self.assertRaises(Exception, self.ts.__setitem__, + (slice(None, None), 2), 2) + + # weird lists. 
[slice(0, 5)] will work but not two slices + result = self.ts[[slice(None, 5)]] + expected = self.ts[:5] + assert_series_equal(result, expected) + + # OK + self.assertRaises(Exception, self.ts.__getitem__, + [5, slice(None, None)]) + self.assertRaises(Exception, self.ts.__setitem__, + [5, slice(None, None)], 2) + + def test_reshape_non_2d(self): + x = Series(np.random.random(201), name='x') + self.assertRaises(TypeError, x.reshape, (len(x),)) + + def test_reshape_2d_return_array(self): + x = Series(np.random.random(201), name='x') + result = x.reshape((-1,1)) + self.assert_(not isinstance(result, Series)) + + result2 = np.reshape(x, (-1,1)) + self.assert_(not isinstance(result, Series)) + + result = x[:, None] + expected = x.reshape((-1,1)) + assert_almost_equal(result, expected) + + def test_basic_getitem_with_labels(self): + indices = self.ts.index[[5, 10, 15]] + + result = self.ts[indices] + expected = self.ts.reindex(indices) + assert_series_equal(result, expected) + + result = self.ts[indices[0]:indices[2]] + expected = self.ts.ix[indices[0]:indices[2]] + assert_series_equal(result, expected) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=range(0, 20, 2)) + inds = [0, 2, 5, 7, 8] + arr_inds = np.array([0, 2, 5, 7, 8]) + result = s[inds] + expected = s.reindex(inds) + assert_series_equal(result, expected) + + result = s[arr_inds] + expected = s.reindex(arr_inds) + assert_series_equal(result, expected) + + def test_basic_setitem_with_labels(self): + indices = self.ts.index[[5, 10, 15]] + + cp = self.ts.copy() + exp = self.ts.copy() + cp[indices] = 0 + exp.ix[indices] = 0 + assert_series_equal(cp, exp) + + cp = self.ts.copy() + exp = self.ts.copy() + cp[indices[0]:indices[2]] = 0 + exp.ix[indices[0]:indices[2]] = 0 + assert_series_equal(cp, exp) + + # integer indexes, be careful + s = Series(np.random.randn(10), index=range(0, 20, 2)) + inds = [0, 4, 6] + arr_inds = np.array([0, 4, 6]) + + cp = s.copy() + exp = s.copy() + s[inds] = 0 + s.ix[inds] = 0 + assert_series_equal(cp, exp) + + cp = s.copy() + exp = s.copy() + s[arr_inds] = 0 + s.ix[arr_inds] = 0 + assert_series_equal(cp, exp) + + inds_notfound = [0, 4, 5, 6] + arr_inds_notfound = np.array([0, 4, 5, 6]) + self.assertRaises(Exception, s.__setitem__, inds_notfound, 0) + self.assertRaises(Exception, s.__setitem__, arr_inds_notfound, 0) + + def test_ix_getitem(self): + inds = self.series.index[[3,4,7]] + assert_series_equal(self.series.ix[inds], self.series.reindex(inds)) + assert_series_equal(self.series.ix[5::2], self.series[5::2]) + + # slice with indices + d1, d2 = self.ts.index[[5, 15]] + result = self.ts.ix[d1:d2] + expected = self.ts.truncate(d1, d2) + assert_series_equal(result, expected) + + # boolean + mask = self.series > self.series.median() + assert_series_equal(self.series.ix[mask], self.series[mask]) + + # ask for index value + self.assertEquals(self.ts.ix[d1], self.ts[d1]) + self.assertEquals(self.ts.ix[d2], self.ts[d2]) + + def test_ix_getitem_not_monotonic(self): + d1, d2 = self.ts.index[[5, 15]] + + ts2 = self.ts[::2][::-1] + + self.assertRaises(KeyError, ts2.ix.__getitem__, slice(d1, d2)) + self.assertRaises(KeyError, ts2.ix.__setitem__, slice(d1, d2), 0) + + def test_ix_getitem_setitem_integer_slice_keyerrors(self): + s = Series(np.random.randn(10), index=range(0, 20, 2)) + + # this is OK + cp = s.copy() + cp.ix[4:10] = 0 + self.assert_((cp.ix[4:10] == 0).all()) + + # so is this + cp = s.copy() + cp.ix[3:11] = 0 + self.assert_((cp.ix[3:11] == 0).values.all()) + + result = s.ix[4:10] + 
result2 = s.ix[3:11] + expected = s.reindex([4, 6, 8, 10]) + + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # non-monotonic, raise KeyError + s2 = s[::-1] + self.assertRaises(KeyError, s2.ix.__getitem__, slice(3, 11)) + self.assertRaises(KeyError, s2.ix.__setitem__, slice(3, 11), 0) + + def test_ix_getitem_iterator(self): + idx = iter(self.series.index[:10]) + result = self.series.ix[idx] + assert_series_equal(result, self.series[:10]) + + def test_ix_setitem(self): + inds = self.series.index[[3,4,7]] + + result = self.series.copy() + result.ix[inds] = 5 + + expected = self.series.copy() + expected[[3,4,7]] = 5 + assert_series_equal(result, expected) + + result.ix[5:10] = 10 + expected[5:10] = 10 + assert_series_equal(result, expected) + + # set slice with indices + d1, d2 = self.series.index[[5, 15]] + result.ix[d1:d2] = 6 + expected[5:16] = 6 # because it's inclusive + assert_series_equal(result, expected) + + # set index value + self.series.ix[d1] = 4 + self.series.ix[d2] = 6 + self.assertEquals(self.series[d1], 4) + self.assertEquals(self.series[d2], 6) + + def test_ix_setitem_boolean(self): + mask = self.series > self.series.median() + + result = self.series.copy() + result.ix[mask] = 0 + expected = self.series + expected[mask] = 0 + assert_series_equal(result, expected) + + def test_ix_setitem_corner(self): + inds = list(self.series.index[[5, 8, 12]]) + self.series.ix[inds] = 5 + self.assertRaises(Exception, self.series.ix.__setitem__, + inds + ['foo'], 5) + + def test_get_set_boolean_different_order(self): + ordered = self.series.order() + + # setting + copy = self.series.copy() + copy[ordered > 0] = 0 + + expected = self.series.copy() + expected[expected > 0] = 0 + + assert_series_equal(copy, expected) + + # getting + sel = self.series[ordered > 0] + exp = self.series[self.series > 0] + assert_series_equal(sel, exp) + + def test_repr(self): + str(self.ts) + str(self.series) + str(self.series.astype(int)) + str(self.objSeries) + + str(Series(tm.randn(1000), index=np.arange(1000))) + str(Series(tm.randn(1000), index=np.arange(1000, 0, step=-1))) + + # empty + str(self.empty) + + # with NaNs + self.series[5:7] = np.NaN + str(self.series) + + # with Nones + ots = self.ts.astype('O') + ots[::2] = None + repr(ots) + + # tuple name, e.g. 
from hierarchical index + self.series.name = ('foo', 'bar', 'baz') + repr(self.series) + + biggie = Series(tm.randn(1000), index=np.arange(1000), + name=('foo', 'bar', 'baz')) + repr(biggie) + + def test_timeseries_repr_object_dtype(self): + index = Index([datetime(2000, 1, 1) + timedelta(i) + for i in range(1000)], dtype=object) + ts = Series(np.random.randn(len(index)), index) + repr(ts) + + ts = tm.makeTimeSeries(1000) + self.assert_(repr(ts).splitlines()[-1].startswith('Freq:')) + + ts2 = ts.ix[np.random.randint(0, len(ts)-1, 400)] + repr(ts).splitlines()[-1] + + def test_iter(self): + for i, val in enumerate(self.series): + self.assertEqual(val, self.series[i]) + + for i, val in enumerate(self.ts): + self.assertEqual(val, self.ts[i]) + + def test_keys(self): + # HACK: By doing this in two stages, we avoid 2to3 wrapping the call + # to .keys() in a list() + getkeys = self.ts.keys + self.assert_(getkeys() is self.ts.index) + + def test_values(self): + self.assert_(np.array_equal(self.ts, self.ts.values)) + + def test_iteritems(self): + for idx, val in self.series.iteritems(): + self.assertEqual(val, self.series[idx]) + + for idx, val in self.ts.iteritems(): + self.assertEqual(val, self.ts[idx]) + + def test_sum(self): + self._check_stat_op('sum', np.sum) + + def test_sum_inf(self): + s = Series(np.random.randn(10)) + s2 = s.copy() + s[5:8] = np.inf + s2[5:8] = np.nan + assert_almost_equal(s.sum(), s2.sum()) + + import pandas.core.nanops as nanops + arr = np.random.randn(100, 100).astype('f4') + arr[:, 2] = np.inf + res = nanops.nansum(arr, axis=1) + expected = nanops._nansum(arr, axis=1) + assert_almost_equal(res, expected) + + def test_mean(self): + self._check_stat_op('mean', np.mean) + + def test_median(self): + self._check_stat_op('median', np.median) + + # test with integers, test failure + int_ts = TimeSeries(np.ones(10, dtype=int), index=range(10)) + self.assertAlmostEqual(np.median(int_ts), int_ts.median()) + + def test_prod(self): + self._check_stat_op('prod', np.prod) + + def test_min(self): + self._check_stat_op('min', np.min, check_objects=True) + + def test_max(self): + self._check_stat_op('max', np.max, check_objects=True) + + def test_var_std(self): + alt = lambda x: np.std(x, ddof=1) + self._check_stat_op('std', alt) + + alt = lambda x: np.var(x, ddof=1) + self._check_stat_op('var', alt) + + result = self.ts.std(ddof=4) + expected = np.std(self.ts.values, ddof=4) + assert_almost_equal(result, expected) + + result = self.ts.var(ddof=4) + expected = np.var(self.ts.values, ddof=4) + assert_almost_equal(result, expected) + + def test_skew(self): + _skip_if_no_scipy() + + from scipy.stats import skew + alt =lambda x: skew(x, bias=False) + self._check_stat_op('skew', alt) + + def test_kurt(self): + _skip_if_no_scipy() + + from scipy.stats import kurtosis + alt = lambda x: kurtosis(x, bias=False) + self._check_stat_op('kurt', alt) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + s = Series(np.random.randn(6), index=index) + self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar']) + + def test_argsort(self): + self._check_accum_op('argsort') + argsorted = self.ts.argsort() + self.assert_(issubclass(argsorted.dtype.type, np.integer)) + + def test_argsort_stable(self): + s = Series(np.random.randint(0, 100, size=10000)) + mindexer = s.argsort(kind='mergesort') + qindexer = s.argsort() + + mexpected = np.argsort(s.values, kind='mergesort') + qexpected = np.argsort(s.values, 
kind='quicksort') + + self.assert_(np.array_equal(mindexer, mexpected)) + self.assert_(np.array_equal(qindexer, qexpected)) + self.assert_(not np.array_equal(qindexer, mindexer)) + + def test_cumsum(self): + self._check_accum_op('cumsum') + + def test_cumprod(self): + self._check_accum_op('cumprod') + + def test_cummin(self): + self.assert_(np.array_equal(self.ts.cummin(), + np.minimum.accumulate(np.array(self.ts)))) + ts = self.ts.copy() + ts[::2] = np.NaN + result = ts.cummin()[1::2] + expected = np.minimum.accumulate(ts.valid()) + + self.assert_(np.array_equal(result, expected)) + + def test_cummax(self): + self.assert_(np.array_equal(self.ts.cummax(), + np.maximum.accumulate(np.array(self.ts)))) + ts = self.ts.copy() + ts[::2] = np.NaN + result = ts.cummax()[1::2] + expected = np.maximum.accumulate(ts.valid()) + + self.assert_(np.array_equal(result, expected)) + + def test_npdiff(self): + s = Series(np.arange(5)) + r = np.diff(s) + assert_series_equal(Series([nan, 0, 0, 0, nan]), r) + + def _check_stat_op(self, name, alternate, check_objects=False): + import pandas.core.nanops as nanops + + def testit(): + f = getattr(Series, name) + + # add some NaNs + self.series[5:15] = np.NaN + + # skipna or no + self.assert_(notnull(f(self.series))) + self.assert_(isnull(f(self.series, skipna=False))) + + # check the result is correct + nona = self.series.dropna() + assert_almost_equal(f(nona), alternate(nona)) + + allna = self.series * nan + self.assert_(np.isnan(f(allna))) + + # dtype=object with None, it works! + s = Series([1, 2, 3, None, 5]) + f(s) + + # check date range + if check_objects: + s = Series(bdate_range('1/1/2000', periods=10)) + res = f(s) + exp = alternate(s) + self.assertEqual(res, exp) + + testit() + + try: + import bottleneck as bn + nanops._USE_BOTTLENECK = False + testit() + nanops._USE_BOTTLENECK = True + except ImportError: + pass + + def _check_accum_op(self, name): + func = getattr(np, name) + self.assert_(np.array_equal(func(self.ts), func(np.array(self.ts)))) + + # with missing values + ts = self.ts.copy() + ts[::2] = np.NaN + + result = func(ts)[1::2] + expected = func(np.array(ts.valid())) + + self.assert_(np.array_equal(result, expected)) + + def test_round(self): + # numpy.round doesn't preserve metadata, probably a numpy bug, + # re: GH #314 + result = np.round(self.ts, 2) + expected = Series(np.round(self.ts.values, 2), index=self.ts.index) + assert_series_equal(result, expected) + self.assertEqual(result.name, self.ts.name) + + def test_prod_numpy16_bug(self): + s = Series([1., 1., 1.] 
, index=range(3)) + result = s.prod() + self.assert_(not isinstance(result, Series)) + + def test_quantile(self): + from pandas.compat.scipy import scoreatpercentile + + q = self.ts.quantile(0.1) + self.assertEqual(q, scoreatpercentile(self.ts.valid(), 10)) + + q = self.ts.quantile(0.9) + self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90)) + + def test_describe(self): + _ = self.series.describe() + _ = self.ts.describe() + + def test_describe_percentiles(self): + desc = self.series.describe(percentile_width=50) + assert '75%' in desc.index + assert '25%' in desc.index + + desc = self.series.describe(percentile_width=95) + assert '97.5%' in desc.index + assert '2.5%' in desc.index + + def test_describe_objects(self): + s = Series(['a', 'b', 'b', np.nan, np.nan, np.nan, 'c', 'd', 'a', 'a']) + result = s.describe() + expected = Series({'count' : 7, 'unique' : 4, + 'top' : 'a', 'freq' : 3}, index=result.index) + assert_series_equal(result, expected) + + dt = list(self.ts.index) + dt.append(dt[0]) + ser = Series(dt) + rs = ser.describe() + min_date = min(dt) + max_date = max(dt) + xp = Series({'count' : len(dt), + 'unique' : len(self.ts.index), + 'first' : min_date, 'last' : max_date, 'freq' : 2, + 'top' : min_date}, index=rs.index) + assert_series_equal(rs, xp) + + def test_append(self): + appendedSeries = self.series.append(self.objSeries) + for idx, value in appendedSeries.iteritems(): + if idx in self.series.index: + self.assertEqual(value, self.series[idx]) + elif idx in self.objSeries.index: + self.assertEqual(value, self.objSeries[idx]) + else: + self.fail("orphaned index!") + + self.assertRaises(Exception, self.ts.append, self.ts, + verify_integrity=True) + + def test_append_many(self): + pieces = [self.ts[:5], self.ts[5:10], self.ts[10:]] + + result = pieces[0].append(pieces[1:]) + assert_series_equal(result, self.ts) + + def test_all_any(self): + np.random.seed(12345) + ts = tm.makeTimeSeries() + bool_series = ts > 0 + self.assert_(not bool_series.all()) + self.assert_(bool_series.any()) + + def test_operators(self): + + def _check_op(series, other, op, pos_only=False): + left = np.abs(series) if pos_only else series + right = np.abs(other) if pos_only else other + + cython_or_numpy = op(left, right) + python = left.combine(right, op) + tm.assert_almost_equal(cython_or_numpy, python) + + def check(series, other): + simple_ops = ['add', 'sub', 'mul', 'truediv', 'floordiv'] + + for opname in simple_ops: + _check_op(series, other, getattr(operator, opname)) + + _check_op(series, other, operator.pow, pos_only=True) + + _check_op(series, other, lambda x, y: operator.add(y, x)) + _check_op(series, other, lambda x, y: operator.sub(y, x)) + _check_op(series, other, lambda x, y: operator.truediv(y, x)) + _check_op(series, other, lambda x, y: operator.floordiv(y, x)) + _check_op(series, other, lambda x, y: operator.mul(y, x)) + _check_op(series, other, lambda x, y: operator.pow(y, x), + pos_only=True) + + check(self.ts, self.ts * 2) + check(self.ts, self.ts * 0) + check(self.ts, self.ts[::2]) + check(self.ts, 5) + + def check_comparators(series, other): + _check_op(series, other, operator.gt) + _check_op(series, other, operator.ge) + _check_op(series, other, operator.eq) + _check_op(series, other, operator.lt) + _check_op(series, other, operator.le) + + check_comparators(self.ts, 5) + check_comparators(self.ts, self.ts + 1) + + def test_operators_empty_int_corner(self): + s1 = Series([], [], dtype=np.int32) + s2 = Series({'x' : 0.}) + + # it works! 
+ _ = s1 * s2 + + # NumPy limitiation =( + + # def test_logical_range_select(self): + # np.random.seed(12345) + # selector = -0.5 <= self.ts <= 0.5 + # expected = (self.ts >= -0.5) & (self.ts <= 0.5) + # assert_series_equal(selector, expected) + + def test_operators_na_handling(self): + from decimal import Decimal + from datetime import date + s = Series([Decimal('1.3'), Decimal('2.3')], + index=[date(2012,1,1), date(2012,1,2)]) + + result = s + s.shift(1) + self.assert_(isnull(result[0])) + + s = Series(['foo', 'bar', 'baz', np.nan]) + result = 'prefix_' + s + expected = Series(['prefix_foo', 'prefix_bar', 'prefix_baz', np.nan]) + assert_series_equal(result, expected) + + result = s + '_suffix' + expected = Series(['foo_suffix', 'bar_suffix', 'baz_suffix', np.nan]) + assert_series_equal(result, expected) + + def test_object_comparisons(self): + s = Series(['a', 'b', np.nan, 'c', 'a']) + + result = s == 'a' + expected = Series([True, False, False, False, True]) + assert_series_equal(result, expected) + + result = s < 'a' + expected = Series([False, False, False, False, False]) + assert_series_equal(result, expected) + + result = s != 'a' + expected = -(s == 'a') + assert_series_equal(result, expected) + + def test_comparison_operators_with_nas(self): + s = Series(bdate_range('1/1/2000', periods=10), dtype=object) + s[::2] = np.nan + + # test that comparions work + ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + for op in ops: + val = s[5] + + f = getattr(operator, op) + result = f(s, val) + + expected = f(s.dropna(), val).reindex(s.index) + + if op == 'ne': + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) + + assert_series_equal(result, expected) + + # fffffffuuuuuuuuuuuu + # result = f(val, s) + # expected = f(val, s.dropna()).reindex(s.index) + # assert_series_equal(result, expected) + + # boolean &, |, ^ should work with object arrays and propagate NAs + + ops = ['and_', 'or_', 'xor'] + mask = s.isnull() + for bool_op in ops: + f = getattr(operator, bool_op) + + filled = s.fillna(s[0]) + + result = f(s < s[9], s > s[3]) + + expected = f(filled < filled[9], filled > filled[3]) + expected[mask] = False + assert_series_equal(result, expected) + + def test_comparison_object_numeric_nas(self): + s = Series(np.random.randn(10), dtype=object) + shifted = s.shift(2) + + ops = ['lt', 'le', 'gt', 'ge', 'eq', 'ne'] + for op in ops: + f = getattr(operator, op) + + result = f(s, shifted) + expected = f(s.astype(float), shifted.astype(float)) + assert_series_equal(result, expected) + + def test_more_na_comparisons(self): + left = Series(['a', np.nan, 'c']) + right = Series(['a', np.nan, 'd']) + + result = left == right + expected = Series([True, False, False]) + assert_series_equal(result, expected) + + result = left != right + expected = Series([False, True, True]) + assert_series_equal(result, expected) + + result = left == np.nan + expected = Series([False, False, False]) + assert_series_equal(result, expected) + + result = left != np.nan + expected = Series([True, True, True]) + assert_series_equal(result, expected) + + def test_comparison_different_length(self): + a = Series(['a', 'b', 'c']) + b = Series(['b', 'a']) + self.assertRaises(ValueError, a.__lt__, b) + + def test_between(self): + s = Series(bdate_range('1/1/2000', periods=20).asobject) + s[::2] = np.nan + + result = s[s.between(s[3], s[17])] + expected = s[3:18].dropna() + assert_series_equal(result, expected) + + result = s[s.between(s[3], s[17], inclusive=False)] + expected = 
s[5:16].dropna() + assert_series_equal(result, expected) + + def test_setitem_na_exception(self): + def testme1(): + s = Series([2,3,4,5,6,7,8,9,10]) + s[::2] = np.nan + + def testme2(): + s = Series([True, True, False, False]) + s[::2] = np.nan + + def testme3(): + s = Series(np.arange(10)) + s[:5] = np.nan + + self.assertRaises(Exception, testme1) + self.assertRaises(Exception, testme2) + self.assertRaises(Exception, testme3) + + def test_scalar_na_cmp_corners(self): + s = Series([2,3,4,5,6,7,8,9,10]) + + def tester(a, b): + return a & b + + self.assertRaises(ValueError, tester, s, datetime(2005,1,1)) + + s = Series([2,3,4,5,6,7,8,9,datetime(2005,1,1)]) + s[::2] = np.nan + + assert_series_equal(tester(s, list(s)), s) + + d = DataFrame({'A':s}) + self.assertRaises(TypeError, tester, s, d) + + def test_idxmin(self): + # test idxmin + # _check_stat_op approach can not be used here because of isnull check. + + # add some NaNs + self.series[5:15] = np.NaN + + # skipna or no + self.assertEqual(self.series[self.series.idxmin()], self.series.min()) + self.assert_(isnull(self.series.idxmin(skipna=False))) + + # no NaNs + nona = self.series.dropna() + self.assertEqual(nona[nona.idxmin()], nona.min()) + self.assertEqual(nona.index.values.tolist().index(nona.idxmin()), + nona.values.argmin()) + + # all NaNs + allna = self.series * nan + self.assert_(isnull(allna.idxmin())) + + def test_idxmax(self): + # test idxmax + # _check_stat_op approach can not be used here because of isnull check. + + # add some NaNs + self.series[5:15] = np.NaN + + # skipna or no + self.assertEqual(self.series[self.series.idxmax()], self.series.max()) + self.assert_(isnull(self.series.idxmax(skipna=False))) + + # no NaNs + nona = self.series.dropna() + self.assertEqual(nona[nona.idxmax()], nona.max()) + self.assertEqual(nona.index.values.tolist().index(nona.idxmax()), + nona.values.argmax()) + + # all NaNs + allna = self.series * nan + self.assert_(isnull(allna.idxmax())) + + def test_operators_date(self): + result = self.objSeries + timedelta(1) + result = self.objSeries - timedelta(1) + + def test_operators_corner(self): + series = self.ts + + empty = Series([], index=Index([])) + + result = series + empty + self.assert_(np.isnan(result).all()) + + result = empty + Series([], index=Index([])) + self.assert_(len(result) == 0) + + # TODO: this returned NotImplemented earlier, what to do? 
+ # deltas = Series([timedelta(1)] * 5, index=np.arange(5)) + # sub_deltas = deltas[::2] + # deltas5 = deltas * 5 + # deltas = deltas + sub_deltas + + # float + int + int_ts = self.ts.astype(int)[:-5] + added = self.ts + int_ts + expected = self.ts.values[:-5] + int_ts.values + self.assert_(np.array_equal(added[:-5], expected)) + + def test_operators_reverse_object(self): + # GH 56 + arr = Series(np.random.randn(10), index=np.arange(10), + dtype=object) + + def _check_op(arr, op): + result = op(1., arr) + expected = op(1., arr.astype(float)) + assert_series_equal(result.astype(float), expected) + + _check_op(arr, operator.add) + _check_op(arr, operator.sub) + _check_op(arr, operator.mul) + _check_op(arr, operator.truediv) + _check_op(arr, operator.floordiv) + + def test_series_frame_radd_bug(self): + from pandas.util.testing import rands + import operator + + # GH 353 + vals = Series([rands(5) for _ in xrange(10)]) + result = 'foo_' + vals + expected = vals.map(lambda x: 'foo_' + x) + assert_series_equal(result, expected) + + frame = DataFrame({'vals' : vals}) + result = 'foo_' + frame + expected = DataFrame({'vals' : vals.map(lambda x: 'foo_' + x)}) + tm.assert_frame_equal(result, expected) + + # really raise this time + self.assertRaises(TypeError, operator.add, datetime.now(), self.ts) + + def test_operators_frame(self): + # rpow does not work with DataFrame + df = DataFrame({'A' : self.ts}) + + tm.assert_almost_equal(self.ts + self.ts, (self.ts + df)['A']) + tm.assert_almost_equal(self.ts ** self.ts, (self.ts ** df)['A']) + tm.assert_almost_equal(self.ts < self.ts, (self.ts < df)['A']) + + def test_operators_combine(self): + def _check_fill(meth, op, a, b, fill_value=0): + exp_index = a.index.union(b.index) + a = a.reindex(exp_index) + b = b.reindex(exp_index) + + amask = isnull(a) + bmask = isnull(b) + + exp_values = [] + for i in range(len(exp_index)): + if amask[i]: + if bmask[i]: + exp_values.append(nan) + continue + exp_values.append(op(fill_value, b[i])) + elif bmask[i]: + if amask[i]: + exp_values.append(nan) + continue + exp_values.append(op(a[i], fill_value)) + else: + exp_values.append(op(a[i], b[i])) + + result = meth(a, b, fill_value=fill_value) + expected = Series(exp_values, exp_index) + assert_series_equal(result, expected) + + a = Series([nan, 1., 2., 3., nan], index=np.arange(5)) + b = Series([nan, 1, nan, 3, nan, 4.], index=np.arange(6)) + + ops = [Series.add, Series.sub, Series.mul, Series.div] + equivs = [operator.add, operator.sub, operator.mul] + if py3compat.PY3: + equivs.append(operator.truediv) + else: + equivs.append(operator.div) + fillvals = [0, 0, 1, 1] + + for op, equiv_op, fv in zip(ops, equivs, fillvals): + result = op(a, b) + exp = equiv_op(a, b) + assert_series_equal(result, exp) + _check_fill(op, equiv_op, a, b, fill_value=fv) + + def test_combine_first(self): + values = tm.makeIntIndex(20).values.astype(float) + series = Series(values, index=tm.makeIntIndex(20)) + + series_copy = series * 2 + series_copy[::2] = np.NaN + + # nothing used from the input + combined = series.combine_first(series_copy) + + self.assert_(np.array_equal(combined, series)) + + # Holes filled from input + combined = series_copy.combine_first(series) + self.assert_(np.isfinite(combined).all()) + + self.assert_(np.array_equal(combined[::2], series[::2])) + self.assert_(np.array_equal(combined[1::2], series_copy[1::2])) + + # mixed types + index = tm.makeStringIndex(20) + floats = Series(tm.randn(20), index=index) + strings = Series(tm.makeStringIndex(10), index=index[::2]) + + 
combined = strings.combine_first(floats) + + tm.assert_dict_equal(strings, combined, compare_keys=False) + tm.assert_dict_equal(floats[1::2], combined, compare_keys=False) + + # corner case + s = Series([1., 2, 3], index=[0, 1, 2]) + result = s.combine_first(Series([], index=[])) + assert_series_equal(s, result) + + def test_update(self): + s = Series([1.5, nan, 3., 4., nan]) + s2 = Series([nan, 3.5, nan, 5.]) + s.update(s2) + + expected = Series([1.5, 3.5, 3., 5., np.nan]) + assert_series_equal(s, expected) + + def test_corr(self): + _skip_if_no_scipy() + + import scipy.stats as stats + + # full overlap + self.assertAlmostEqual(self.ts.corr(self.ts), 1) + + # partial overlap + self.assertAlmostEqual(self.ts[:15].corr(self.ts[5:]), 1) + + # No overlap + self.assert_(np.isnan(self.ts[::2].corr(self.ts[1::2]))) + + # all NA + cp = self.ts[:10].copy() + cp[:] = np.nan + self.assert_(isnull(cp.corr(cp))) + + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + result = A.corr(B) + expected, _ = stats.pearsonr(A, B) + self.assertAlmostEqual(result, expected) + + def test_corr_rank(self): + _skip_if_no_scipy() + + import scipy + import scipy.stats as stats + + # kendall and spearman + A = tm.makeTimeSeries() + B = tm.makeTimeSeries() + A[-5:] = A[:5] + result = A.corr(B, method='kendall') + expected = stats.kendalltau(A, B)[0] + self.assertAlmostEqual(result, expected) + + result = A.corr(B, method='spearman') + expected = stats.spearmanr(A, B)[0] + self.assertAlmostEqual(result, expected) + + # these methods got rewritten in 0.8 + if int(scipy.__version__.split('.')[1]) < 9: + raise nose.SkipTest + + # results from R + A = Series([-0.89926396, 0.94209606, -1.03289164, -0.95445587, + 0.76910310, -0.06430576, -2.09704447, 0.40660407, + -0.89926396, 0.94209606]) + B = Series([-1.01270225, -0.62210117, -1.56895827, 0.59592943, + -0.01680292, 1.17258718, -1.06009347, -0.10222060, + -0.89076239, 0.89372375]) + kexp = 0.4319297 + sexp = 0.5853767 + self.assertAlmostEqual(A.corr(B, method='kendall'), kexp) + self.assertAlmostEqual(A.corr(B, method='spearman'), sexp) + + def test_cov(self): + # full overlap + self.assertAlmostEqual(self.ts.cov(self.ts), self.ts.std()**2) + + # partial overlap + self.assertAlmostEqual(self.ts[:15].cov(self.ts[5:]), self.ts[5:15].std()**2) + + # No overlap + self.assert_(np.isnan(self.ts[::2].cov(self.ts[1::2]))) + + # all NA + cp = self.ts[:10].copy() + cp[:] = np.nan + self.assert_(isnull(cp.cov(cp))) + + def test_copy(self): + ts = self.ts.copy() + + ts[::2] = np.NaN + + # Did not modify original Series + self.assertFalse(np.isnan(self.ts[0])) + + def test_count(self): + self.assertEqual(self.ts.count(), len(self.ts)) + + self.ts[::2] = np.NaN + + self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum()) + + def test_value_counts_nunique(self): + s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a']) + hist = s.value_counts() + expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) + assert_series_equal(hist, expected) + + self.assertEquals(s.nunique(), 4) + + # handle NA's properly + s[5:7] = np.nan + hist = s.value_counts() + expected = s.dropna().value_counts() + assert_series_equal(hist, expected) + + s = Series({}) + hist = s.value_counts() + expected = Series([]) + assert_series_equal(hist, expected) + + def test_unique(self): + # 714 also, dtype=float + s = Series([1.2345] * 100) + s[::2] = np.nan + result = s.unique() + self.assert_(len(result) == 2) + + s = Series([1.2345] * 100, dtype='f4') + s[::2] = np.nan + result = s.unique() + 
self.assert_(len(result) == 2) + + # NAs in object arrays #714 + s = Series(['foo'] * 100, dtype='O') + s[::2] = np.nan + result = s.unique() + self.assert_(len(result) == 2) + + # integers + s = Series(np.random.randint(0, 100, size=100)) + result = np.sort(s.unique()) + expected = np.unique(s.values) + self.assert_(np.array_equal(result, expected)) + + s = Series(np.random.randint(0, 100, size=100).astype(np.int32)) + result = np.sort(s.unique()) + expected = np.unique(s.values) + self.assert_(np.array_equal(result, expected)) + + # test string arrays for coverage + strings = np.tile(np.array([tm.rands(10) for _ in xrange(10)]), 10) + result = np.sort(nanops.unique1d(strings)) + expected = np.unique(strings) + self.assert_(np.array_equal(result, expected)) + + # decision about None + + s = Series([1, 2, 3, None, None, None], dtype=object) + result = s.unique() + expected = np.array([1, 2, 3, None], dtype=object) + self.assert_(np.array_equal(result, expected)) + + def test_sort(self): + ts = self.ts.copy() + ts.sort() + + self.assert_(np.array_equal(ts, self.ts.order())) + self.assert_(np.array_equal(ts.index, self.ts.order().index)) + + def test_sort_index(self): + import random + + rindex = list(self.ts.index) + random.shuffle(rindex) + + random_order = self.ts.reindex(rindex) + sorted_series = random_order.sort_index() + assert_series_equal(sorted_series, self.ts) + + + # descending + sorted_series = random_order.sort_index(ascending=False) + assert_series_equal(sorted_series, + self.ts.reindex(self.ts.index[::-1])) + + def test_order(self): + ts = self.ts.copy() + ts[:5] = np.NaN + vals = ts.values + + result = ts.order() + self.assert_(np.isnan(result[-5:]).all()) + self.assert_(np.array_equal(result[:-5], np.sort(vals[5:]))) + + result = ts.order(na_last=False) + self.assert_(np.isnan(result[:5]).all()) + self.assert_(np.array_equal(result[5:], np.sort(vals[5:]))) + + # something object-type + ser = Series(['A', 'B'], [1, 2]) + # no failure + ser.order() + + # ascending=False + ordered = ts.order(ascending=False) + expected = np.sort(ts.valid().values)[::-1] + assert_almost_equal(expected, ordered.valid().values) + ordered = ts.order(ascending=False, na_last=False) + assert_almost_equal(expected, ordered.valid().values) + + def test_rank(self): + from pandas.compat.scipy import rankdata + + self.ts[::2] = np.nan + self.ts[:10][::3] = 4. 
+ + ranks = self.ts.rank() + oranks = self.ts.astype('O').rank() + + assert_series_equal(ranks, oranks) + + mask = np.isnan(self.ts) + filled = self.ts.fillna(np.inf) + + exp = rankdata(filled) + exp[mask] = np.nan + + assert_almost_equal(ranks, exp) + + def test_from_csv(self): + self.ts.to_csv('_foo') + ts = Series.from_csv('_foo') + assert_series_equal(self.ts, ts) + + self.series.to_csv('_foo') + series = Series.from_csv('_foo') + self.assert_(series.name is None) + self.assert_(series.index.name is None) + assert_series_equal(self.series, series) + + outfile = open('_foo', 'w') + outfile.write('1998-01-01|1.0\n1999-01-01|2.0') + outfile.close() + series = Series.from_csv('_foo',sep='|') + checkseries = Series({datetime(1998,1,1): 1.0, datetime(1999,1,1): 2.0}) + assert_series_equal(checkseries, series) + + series = Series.from_csv('_foo',sep='|',parse_dates=False) + checkseries = Series({'1998-01-01': 1.0, '1999-01-01': 2.0}) + assert_series_equal(checkseries, series) + + os.remove('_foo') + + def test_to_csv(self): + self.ts.to_csv('_foo') + + lines = open('_foo', 'U').readlines() + assert(lines[1] != '\n') + + self.ts.to_csv('_foo', index=False) + arr = np.loadtxt('_foo') + assert_almost_equal(arr, self.ts.values) + + os.remove('_foo') + + def test_to_csv_stringio(self): + buf = StringIO() + self.ts.to_csv(buf, index=False) + buf.seek(0) + arr = np.loadtxt(buf) + assert_almost_equal(arr, self.ts.values) + + def test_to_dict(self): + self.assert_(np.array_equal(Series(self.ts.to_dict()), self.ts)) + + def test_clip(self): + val = self.ts.median() + + self.assertEqual(self.ts.clip_lower(val).min(), val) + self.assertEqual(self.ts.clip_upper(val).max(), val) + + self.assertEqual(self.ts.clip(lower=val).min(), val) + self.assertEqual(self.ts.clip(upper=val).max(), val) + + result = self.ts.clip(-0.5, 0.5) + expected = np.clip(self.ts, -0.5, 0.5) + assert_series_equal(result, expected) + self.assert_(isinstance(expected, Series)) + + def test_valid(self): + ts = self.ts.copy() + ts[::2] = np.NaN + + result = ts.valid() + self.assertEqual(len(result), ts.count()) + + tm.assert_dict_equal(result, ts, compare_keys=False) + + def test_isnull(self): + ser = Series([0,5.4,3,nan,-0.001]) + assert_series_equal(ser.isnull(), Series([False,False,False,True,False])) + ser = Series(["hi","",nan]) + assert_series_equal(ser.isnull(), Series([False,False,True])) + + def test_notnull(self): + ser = Series([0,5.4,3,nan,-0.001]) + assert_series_equal(ser.notnull(), Series([True,True,True,False,True])) + ser = Series(["hi","",nan]) + assert_series_equal(ser.notnull(), Series([True,True,False])) + + def test_shift(self): + shifted = self.ts.shift(1) + unshifted = shifted.shift(-1) + + tm.assert_dict_equal(unshifted.valid(), self.ts, compare_keys=False) + + offset = datetools.bday + shifted = self.ts.shift(1, freq=offset) + unshifted = shifted.shift(-1, freq=offset) + + assert_series_equal(unshifted, self.ts) + + unshifted = self.ts.shift(0, freq=offset) + assert_series_equal(unshifted, self.ts) + + shifted = self.ts.shift(1, freq='B') + unshifted = shifted.shift(-1, freq='B') + + assert_series_equal(unshifted, self.ts) + + # corner case + unshifted = self.ts.shift(0) + assert_series_equal(unshifted, self.ts) + + # Shifting with PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.shift(1) + unshifted = shifted.shift(-1) + tm.assert_dict_equal(unshifted.valid(), ps, compare_keys=False) + + shifted2 = ps.shift(1, 'B') + shifted3 = ps.shift(1, datetools.bday) + assert_series_equal(shifted2, shifted3) + 
assert_series_equal(ps, shifted2.shift(-1, 'B')) + + self.assertRaises(ValueError, ps.shift, freq='D') + + # legacy support + smod._SHOW_WARNINGS = False + shifted4 = ps.shift(1, timeRule='B') + assert_series_equal(shifted2, shifted4) + + shifted5 = ps.shift(1, offset=datetools.bday) + assert_series_equal(shifted5, shifted4) + smod._SHOW_WARNINGS = True + + def test_tshift(self): + # PeriodIndex + ps = tm.makePeriodSeries() + shifted = ps.tshift(1) + unshifted = shifted.tshift(-1) + + assert_series_equal(unshifted, ps) + + shifted2 = ps.tshift(freq='B') + assert_series_equal(shifted, shifted2) + + shifted3 = ps.tshift(freq=datetools.bday) + assert_series_equal(shifted, shifted3) + + self.assertRaises(ValueError, ps.tshift, freq='M') + + # DatetimeIndex + shifted = self.ts.tshift(1) + unshifted = shifted.tshift(-1) + + assert_series_equal(self.ts, unshifted) + + shifted2 = self.ts.tshift(freq=self.ts.index.freq) + assert_series_equal(shifted, shifted2) + + inferred_ts = Series(self.ts.values, Index(np.asarray(self.ts.index))) + shifted = inferred_ts.tshift(1) + unshifted = shifted.tshift(-1) + assert_series_equal(shifted, self.ts.tshift(1)) + assert_series_equal(unshifted, inferred_ts) + + no_freq = self.ts[[0, 5, 7]] + self.assertRaises(ValueError, no_freq.tshift) + + def test_shift_int(self): + ts = self.ts.astype(int) + shifted = ts.shift(1) + expected = ts.astype(float).shift(1) + assert_series_equal(shifted, expected) + + def test_truncate(self): + offset = datetools.bday + + ts = self.ts[::3] + + start, end = self.ts.index[3], self.ts.index[6] + start_missing, end_missing = self.ts.index[2], self.ts.index[7] + + # neither specified + truncated = ts.truncate() + assert_series_equal(truncated, ts) + + # both specified + expected = ts[1:3] + + truncated = ts.truncate(start, end) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(start_missing, end_missing) + assert_series_equal(truncated, expected) + + # start specified + expected = ts[1:] + + truncated = ts.truncate(before=start) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(before=start_missing) + assert_series_equal(truncated, expected) + + # end specified + expected = ts[:3] + + truncated = ts.truncate(after=end) + assert_series_equal(truncated, expected) + + truncated = ts.truncate(after=end_missing) + assert_series_equal(truncated, expected) + + # corner case, empty series returned + truncated = ts.truncate(after=self.ts.index[0] - offset) + assert(len(truncated) == 0) + + truncated = ts.truncate(before=self.ts.index[-1] + offset) + assert(len(truncated) == 0) + + self.assertRaises(Exception, ts.truncate, + before=self.ts.index[-1] + offset, + after=self.ts.index[0] - offset) + + def test_ptp(self): + N = 1000 + arr = np.random.randn(N) + ser = Series(arr) + self.assertEqual(np.ptp(ser), np.ptp(arr)) + + def test_asof(self): + # array or list or dates + N = 50 + rng = date_range('1/1/1990', periods=N, freq='53s') + ts = Series(np.random.randn(N), index=rng) + ts[15:30] = np.nan + dates = date_range('1/1/1990', periods=N * 3, freq='25s') + + result = ts.asof(dates) + self.assert_(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + result = ts.asof(list(dates)) + self.assert_(notnull(result).all()) + lb = ts.index[14] + ub = ts.index[30] + + mask = (result.index >= lb) & (result.index < ub) + rs = result[mask] + self.assert_((rs == ts[lb]).all()) + + val = result[result.index[result.index >= ub][0]] + self.assertEqual(ts[ub], val) + + self.ts[5:10] = np.NaN + self.ts[15:20] 
= np.NaN + + val1 = self.ts.asof(self.ts.index[7]) + val2 = self.ts.asof(self.ts.index[19]) + + self.assertEqual(val1, self.ts[4]) + self.assertEqual(val2, self.ts[14]) + + # accepts strings + val1 = self.ts.asof(str(self.ts.index[7])) + self.assertEqual(val1, self.ts[4]) + + # in there + self.assertEqual(self.ts.asof(self.ts.index[3]), self.ts[3]) + + # no as of value + d = self.ts.index[0] - datetools.bday + self.assert_(np.isnan(self.ts.asof(d))) + + def test_asof_more(self): + from pandas import date_range + s = Series([nan, nan, 1, 2, nan, nan, 3, 4, 5], + index=date_range('1/1/2000', periods=9)) + + dates = s.index[[4, 5, 6, 2, 1]] + + result = s.asof(dates) + expected = Series([2, 2, 3, 1, np.nan], index=dates) + + assert_series_equal(result, expected) + + s = Series([1.5, 2.5, 1, 2, nan, nan, 3, 4, 5], + index=date_range('1/1/2000', periods=9)) + result = s.asof(s.index[0]) + self.assertEqual(result, s[0]) + + def test_astype_cast_nan_int(self): + df = Series([1.0, 2.0, 3.0, np.nan]) + self.assertRaises(ValueError, df.astype, np.int64) + + def test_map(self): + index, data = tm.getMixedTypeDict() + + source = Series(data['B'], index=data['C']) + target = Series(data['C'][:4], index=data['D'][:4]) + + merged = target.map(source) + + for k, v in merged.iteritems(): + self.assertEqual(v, source[target[k]]) + + # input could be a dict + merged = target.map(source.to_dict()) + + for k, v in merged.iteritems(): + self.assertEqual(v, source[target[k]]) + + # function + result = self.ts.map(lambda x: x * 2) + self.assert_(np.array_equal(result, self.ts * 2)) + + def test_map_int(self): + left = Series({'a' : 1., 'b' : 2., 'c' : 3., 'd' : 4}) + right = Series({1 : 11, 2 : 22, 3 : 33}) + + self.assert_(left.dtype == np.float_) + self.assert_(issubclass(right.dtype.type, np.integer)) + + merged = left.map(right) + self.assert_(merged.dtype == np.float_) + self.assert_(isnull(merged['d'])) + self.assert_(not isnull(merged['c'])) + + def test_map_type_inference(self): + s = Series(range(3)) + s2 = s.map(lambda x: np.where(x == 0, 0, 1)) + self.assert_(issubclass(s2.dtype.type, np.integer)) + + def test_map_decimal(self): + from decimal import Decimal + + result = self.series.map(lambda x: Decimal(str(x))) + self.assert_(result.dtype == np.object_) + self.assert_(isinstance(result[0], Decimal)) + + def test_apply(self): + assert_series_equal(self.ts.apply(np.sqrt), np.sqrt(self.ts)) + + # elementwise-apply + import math + assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts)) + + # does not return Series + result = self.ts.apply(lambda x: x.values * 2) + assert_series_equal(result, self.ts * 2) + + def test_apply_same_length_inference_bug(self): + s = Series([1, 2]) + f = lambda x: (x, x + 1) + + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + s = Series([1, 2, 3]) + result = s.apply(f) + expected = s.map(f) + assert_series_equal(result, expected) + + def test_apply_dont_convert_dtype(self): + s = Series(np.random.randn(10)) + + f = lambda x: x if x > 0 else np.nan + result = s.apply(f, convert_dtype=False) + self.assert_(result.dtype == object) + + def test_align(self): + def _check_align(a, b, how='left', fill=None): + aa, ab = a.align(b, join=how, fill_value=fill) + + join_index = a.index.join(b.index, how=how) + if fill is not None: + diff_a = aa.index.diff(join_index) + diff_b = ab.index.diff(join_index) + if len(diff_a) > 0: + self.assert_((aa.reindex(diff_a) == fill).all()) + if len(diff_b) > 0: + self.assert_((ab.reindex(diff_b) == 
fill).all()) + + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + if fill is not None: + ea = ea.fillna(fill) + eb = eb.fillna(fill) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + + for kind in JOIN_TYPES: + _check_align(self.ts[2:], self.ts[:-5], how=kind) + _check_align(self.ts[2:], self.ts[:-5], how=kind, fill=-1) + + # empty left + _check_align(self.ts[:0], self.ts[:-5], how=kind) + + # empty right + _check_align(self.ts[:-5], self.ts[:0], how=kind) + + # both empty + _check_align(self.ts[:0], self.ts[:0], how=kind) + + def test_align_fill_method(self): + def _check_align(a, b, how='left', method='pad', limit=None): + aa, ab = a.align(b, join=how, method=method, limit=limit) + + join_index = a.index.join(b.index, how=how) + ea = a.reindex(join_index) + eb = b.reindex(join_index) + + ea = ea.fillna(method=method, limit=limit) + eb = eb.fillna(method=method, limit=limit) + + assert_series_equal(aa, ea) + assert_series_equal(ab, eb) + + for kind in JOIN_TYPES: + for meth in ['pad', 'bfill']: + _check_align(self.ts[2:], self.ts[:-5], how=kind, method=meth) + _check_align(self.ts[2:], self.ts[:-5], how=kind, + method=meth, limit=1) + + # empty left + _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth) + _check_align(self.ts[:0], self.ts[:-5], how=kind, method=meth, + limit=1) + + # empty right + _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth) + _check_align(self.ts[:-5], self.ts[:0], how=kind, method=meth, + limit=1) + + # both empty + _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth) + _check_align(self.ts[:0], self.ts[:0], how=kind, method=meth, + limit=1) + + def test_align_nocopy(self): + b = self.ts[:5].copy() + + # do copy + a = self.ts.copy() + ra, _ = a.align(b, join='left') + ra[:5] = 5 + self.assert_(not (a[:5] == 5).any()) + + # do not copy + a = self.ts.copy() + ra, _ = a.align(b, join='left', copy=False) + ra[:5] = 5 + self.assert_((a[:5] == 5).all()) + + # do copy + a = self.ts.copy() + b = self.ts[:5].copy() + _, rb = a.align(b, join='right') + rb[:3] = 5 + self.assert_(not (b[:3] == 5).any()) + + # do not copy + a = self.ts.copy() + b = self.ts[:5].copy() + _, rb = a.align(b, join='right', copy=False) + rb[:2] = 5 + self.assert_((b[:2] == 5).all()) + + def test_align_sameindex(self): + a, b = self.ts.align(self.ts, copy=False) + self.assert_(a.index is self.ts.index) + self.assert_(b.index is self.ts.index) + + # a, b = self.ts.align(self.ts, copy=True) + # self.assert_(a.index is not self.ts.index) + # self.assert_(b.index is not self.ts.index) + + def test_reindex(self): + identity = self.series.reindex(self.series.index) + self.assertEqual(id(self.series.index), id(identity.index)) + + subIndex = self.series.index[10:20] + subSeries = self.series.reindex(subIndex) + + for idx, val in subSeries.iteritems(): + self.assertEqual(val, self.series[idx]) + + subIndex2 = self.ts.index[10:20] + subTS = self.ts.reindex(subIndex2) + + for idx, val in subTS.iteritems(): + self.assertEqual(val, self.ts[idx]) + stuffSeries = self.ts.reindex(subIndex) + + self.assert_(np.isnan(stuffSeries).all()) + + # This is extremely important for the Cython code to not screw up + nonContigIndex = self.ts.index[::2] + subNonContig = self.ts.reindex(nonContigIndex) + for idx, val in subNonContig.iteritems(): + self.assertEqual(val, self.ts[idx]) + + self.assertRaises(ValueError, self.ts.reindex) + + def test_reindex_corner(self): + # (don't forget to fix this) I think it's fixed + reindexed_dep = self.empty.reindex(self.ts.index, 
method='pad') + + # corner case: pad empty series + reindexed = self.empty.reindex(self.ts.index, method='pad') + + # pass non-Index + reindexed = self.ts.reindex(list(self.ts.index)) + assert_series_equal(self.ts, reindexed) + + # bad fill method + ts = self.ts[::2] + self.assertRaises(Exception, ts.reindex, self.ts.index, method='foo') + + def test_reindex_pad(self): + s = Series(np.arange(10), np.arange(10)) + + s2 = s[::2] + + reindexed = s2.reindex(s.index, method='pad') + reindexed2 = s2.reindex(s.index, method='ffill') + assert_series_equal(reindexed, reindexed2) + + # used platform int above, need to pass int explicitly here per #1219 + expected = Series([0, 0, 2, 2, 4, 4, 6, 6, 8, 8], dtype=int, + index=np.arange(10)) + assert_series_equal(reindexed, expected) + + def test_reindex_backfill(self): + pass + + def test_reindex_int(self): + ts = self.ts[::2] + int_ts = Series(np.zeros(len(ts), dtype=int), index=ts.index) + + # this should work fine + reindexed_int = int_ts.reindex(self.ts.index) + + # if NaNs introduced + self.assert_(reindexed_int.dtype == np.float_) + + # NO NaNs introduced + reindexed_int = int_ts.reindex(int_ts.index[::2]) + self.assert_(reindexed_int.dtype == np.int_) + + def test_reindex_bool(self): + + # A series other than float, int, string, or object + ts = self.ts[::2] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + + # this should work fine + reindexed_bool = bool_ts.reindex(self.ts.index) + + # if NaNs introduced + self.assert_(reindexed_bool.dtype == np.object_) + + # NO NaNs introduced + reindexed_bool = bool_ts.reindex(bool_ts.index[::2]) + self.assert_(reindexed_bool.dtype == np.bool_) + + def test_reindex_bool_pad(self): + # fail + ts = self.ts[5:] + bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index) + filled_bool = bool_ts.reindex(self.ts.index, method='pad') + self.assert_(isnull(filled_bool[:5]).all()) + + def test_reindex_like(self): + other = self.ts[::2] + assert_series_equal(self.ts.reindex(other.index), + self.ts.reindex_like(other)) + + def test_reindex_fill_value(self): + #------------------------------------------------------------ + # floats + floats = Series([1., 2., 3.]) + result = floats.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + result = floats.reindex([1, 2, 3], fill_value=0) + expected = Series([2., 3., 0], index=[1, 2, 3]) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # ints + ints = Series([1, 2, 3]) + + result = ints.reindex([1, 2, 3]) + expected = Series([2., 3., np.nan], index=[1, 2, 3]) + assert_series_equal(result, expected) + + # don't upcast + result = ints.reindex([1, 2, 3], fill_value=0) + expected = Series([2, 3, 0], index=[1, 2, 3]) + self.assert_(issubclass(result.dtype.type, np.integer)) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # objects + objects = Series([1, 2, 3], dtype=object) + + result = objects.reindex([1, 2, 3]) + expected = Series([2, 3, np.nan], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = objects.reindex([1, 2, 3], fill_value='foo') + expected = Series([2, 3, 'foo'], index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + #------------------------------------------------------------ + # bools + bools = Series([True, False, True]) + + result = bools.reindex([1, 2, 3]) + expected = Series([False, True, np.nan], 
index=[1, 2, 3], dtype=object) + assert_series_equal(result, expected) + + result = bools.reindex([1, 2, 3], fill_value=False) + expected = Series([False, True, False], index=[1, 2, 3]) + assert_series_equal(result, expected) + + def test_rename(self): + renamer = lambda x: x.strftime('%Y%m%d') + renamed = self.ts.rename(renamer) + self.assertEqual(renamed.index[0], renamer(self.ts.index[0])) + + # dict + rename_dict = dict(zip(self.ts.index, renamed.index)) + renamed2 = self.ts.rename(rename_dict) + assert_series_equal(renamed, renamed2) + + # partial dict + s = Series(np.arange(4), index=['a', 'b', 'c', 'd']) + renamed = s.rename({'b' : 'foo', 'd' : 'bar'}) + self.assert_(np.array_equal(renamed.index, ['a', 'foo', 'c', 'bar'])) + + def test_rename_inplace(self): + renamer = lambda x: x.strftime('%Y%m%d') + expected = renamer(self.ts.index[0]) + self.ts.rename(renamer, inplace=True) + self.assertEqual(self.ts.index[0], expected) + + def test_preserveRefs(self): + seq = self.ts[[5,10,15]] + seq[1] = np.NaN + self.assertFalse(np.isnan(self.ts[10])) + + def test_ne(self): + ts = TimeSeries([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) + expected = [True, True, False, True, True] + self.assert_(tm.equalContents(ts.index != 5, expected)) + self.assert_(tm.equalContents(~(ts.index == 5), expected)) + + def test_pad_nan(self): + x = TimeSeries([np.nan, 1., np.nan, 3., np.nan], + ['z', 'a', 'b', 'c', 'd'], dtype=float) + x.fillna(method='pad', inplace=True) + expected = TimeSeries([np.nan, 1.0, 1.0, 3.0, 3.0], + ['z', 'a', 'b', 'c', 'd'], dtype=float) + assert_series_equal(x[1:], expected[1:]) + self.assert_(np.isnan(x[0]), np.isnan(expected[0])) + + def test_unstack(self): + from numpy import nan + from pandas.util.testing import assert_frame_equal + + index = MultiIndex(levels=[['bar', 'foo'], ['one', 'three', 'two']], + labels=[[1, 1, 0, 0], [0, 1, 0, 2]]) + + s = Series(np.arange(4.), index=index) + unstacked = s.unstack() + + expected = DataFrame([[2., nan, 3.], [0., 1., nan]], + index=['bar', 'foo'], + columns=['one', 'three', 'two']) + + assert_frame_equal(unstacked, expected) + + unstacked = s.unstack(level=0) + assert_frame_equal(unstacked, expected.T) + + index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]], + labels=[[0, 0, 0, 0, 0, 0], + [0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + s = Series(np.random.randn(6), index=index) + exp_index = MultiIndex(levels=[['one', 'two', 'three'], [0, 1]], + labels=[[0, 1, 2, 0, 1, 2], + [0, 1, 0, 1, 0, 1]]) + expected = DataFrame({'bar' : s.values}, index=exp_index).sortlevel(0) + unstacked = s.unstack(0) + assert_frame_equal(unstacked, expected) + + def test_head_tail(self): + assert_series_equal(self.series.head(), self.series[:5]) + assert_series_equal(self.series.tail(), self.series[-5:]) + + def test_isin(self): + s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C']) + + result = s.isin(['A', 'C']) + expected = Series([True, False, True, False, False, False, True, True]) + assert_series_equal(result, expected) + +#------------------------------------------------------------------------------- +# TimeSeries-specific + + def test_fillna(self): + ts = Series([0., 1., 2., 3., 4.], index=tm.makeDateIndex(5)) + + self.assert_(np.array_equal(ts, ts.fillna())) + + ts[2] = np.NaN + + self.assert_(np.array_equal(ts.fillna(), [0., 1., 1., 3., 4.])) + self.assert_(np.array_equal(ts.fillna(method='backfill'), + [0., 1., 3., 3., 4.])) + + self.assert_(np.array_equal(ts.fillna(value=5), [0., 1., 5., 3., 4.])) + + def test_fillna_bug(self): + x 
= Series([nan, 1., nan, 3., nan],['z','a','b','c','d']) + filled = x.fillna(method='ffill') + expected = Series([nan, 1., 1., 3., 3.], x.index) + assert_series_equal(filled, expected) + + filled = x.fillna(method='bfill') + expected = Series([1., 1., 3., 3., nan], x.index) + assert_series_equal(filled, expected) + + def test_fillna_inplace(self): + x = Series([nan, 1., nan, 3., nan],['z','a','b','c','d']) + y = x.copy() + + y2 = y.fillna(value=0, inplace=True) + self.assert_(y is y2) + + expected = x.fillna(value=0) + assert_series_equal(y2, expected) + + def test_fillna_invalid_method(self): + try: + self.ts.fillna(method='ffil') + except ValueError, inst: + self.assert_('ffil' in str(inst)) + + def test_replace(self): + N = 100 + ser = Series(np.random.randn(N)) + ser[0:4] = np.nan + ser[6:10] = 0 + + # replace list with a single value + rs = ser.replace([np.nan], -1, inplace=True) + exp = ser.fillna(-1) + assert_series_equal(rs, exp) + + rs = ser.replace(0., np.nan) + ser[ser == 0.] = np.nan + assert_series_equal(rs, ser) + + ser = Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), + dtype=object) + ser[:5] = np.nan + ser[6:10] = 'foo' + ser[20:30] = 'bar' + + # replace list with a single value + rs = ser.replace([np.nan, 'foo', 'bar'], -1) + + self.assert_((rs[:5] == -1).all()) + self.assert_((rs[6:10] == -1).all()) + self.assert_((rs[20:30] == -1).all()) + self.assert_((isnull(ser[:5])).all()) + + # replace with different values + rs = ser.replace({np.nan : -1, 'foo' : -2, 'bar' : -3}) + + self.assert_((rs[:5] == -1).all()) + self.assert_((rs[6:10] == -2).all()) + self.assert_((rs[20:30] == -3).all()) + self.assert_((isnull(ser[:5])).all()) + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + assert_series_equal(rs, rs2) + + # replace with forward fill not considering np.nan missing + s2 = ser.copy() + s2[5] = np.nan + rs3 = s2.replace(['foo', 'bar']) + self.assert_(isnull(rs3[6])) + + # replace with back fill considering np.nan as missing + rs4 = ser.replace([np.nan, 'foo', 'bar'], method='bfill') + assert_almost_equal(rs4[4], ser[5]) + + # replace inplace + ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + self.assert_((ser[:5] == -1).all()) + self.assert_((ser[6:10] == -1).all()) + self.assert_((ser[20:30] == -1).all()) + + ser = Series([np.nan, 0, np.inf]) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + ser = Series([np.nan, 0, 'foo', 'bar', np.inf, None, lib.NaT]) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + filled = ser.copy() + filled[4] = 0 + assert_series_equal(ser.replace(np.inf, 0), filled) + + ser = Series(self.ts.index) + assert_series_equal(ser.replace(np.nan, 0), ser.fillna(0)) + + # malformed + self.assertRaises(ValueError, ser.replace, [1,2,3], [np.nan, 0]) + self.assertRaises(ValueError, ser.replace, xrange(1,3), [np.nan, 0]) + + ser = Series([0, 1, 2, 3, 4]) + result = ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0]) + assert_series_equal(result, Series([4, 3, 2, 1, 0])) + + def test_asfreq(self): + ts = Series([0., 1., 2.], index=[datetime(2009, 10, 30), + datetime(2009, 11, 30), + datetime(2009, 12, 31)]) + + daily_ts = ts.asfreq('B') + monthly_ts = daily_ts.asfreq('BM') + self.assert_(np.array_equal(monthly_ts, ts)) + + daily_ts = ts.asfreq('B', method='pad') + monthly_ts = daily_ts.asfreq('BM') + self.assert_(np.array_equal(monthly_ts, ts)) + + daily_ts = ts.asfreq(datetools.bday) + monthly_ts = daily_ts.asfreq(datetools.bmonthEnd) + 
self.assert_(np.array_equal(monthly_ts, ts)) + + result = ts[:0].asfreq('M') + self.assert_(len(result) == 0) + self.assert_(result is not ts) + + def test_interpolate(self): + ts = Series(np.arange(len(self.ts), dtype=float), self.ts.index) + + ts_copy = ts.copy() + ts_copy[5:10] = np.NaN + + linear_interp = ts_copy.interpolate(method='linear') + self.assert_(np.array_equal(linear_interp, ts)) + + ord_ts = Series([d.toordinal() for d in self.ts.index], + index=self.ts.index).astype(float) + + ord_ts_copy = ord_ts.copy() + ord_ts_copy[5:10] = np.NaN + + time_interp = ord_ts_copy.interpolate(method='time') + self.assert_(np.array_equal(time_interp, ord_ts)) + + # try time interpolation on a non-TimeSeries + self.assertRaises(Exception, self.series.interpolate, method='time') + + def test_interpolate_index_values(self): + s = Series(np.nan, index=np.sort(np.random.rand(30))) + s[::3] = np.random.randn(10) + + vals = s.index.values.astype(float) + + result = s.interpolate(method='values') + + expected = s.copy() + bad = isnull(expected.values) + good = -bad + expected[bad] = np.interp(vals[bad], vals[good], s.values[good]) + + assert_series_equal(result, expected) + + def test_weekday(self): + # Just run the function + weekdays = self.ts.weekday + + def test_diff(self): + # Just run the function + self.ts.diff() + + def test_pct_change(self): + rs = self.ts.pct_change(fill_method=None) + assert_series_equal(rs, self.ts / self.ts.shift(1) - 1) + + rs = self.ts.pct_change(2) + filled = self.ts.fillna(method='pad') + assert_series_equal(rs, filled / filled.shift(2) - 1) + + rs = self.ts.pct_change(fill_method='bfill', limit=1) + filled = self.ts.fillna(method='bfill', limit=1) + assert_series_equal(rs, filled / filled.shift(1) - 1) + + rs = self.ts.pct_change(freq='5D') + filled = self.ts.fillna(method='pad') + assert_series_equal(rs, filled / filled.shift(freq='5D') - 1) + + def test_pct_change_shift_over_nas(self): + s = Series([1., 1.5, np.nan, 2.5, 3.]) + + chg = s.pct_change() + expected = Series([np.nan, 0.5, np.nan, 2.5/1.5 -1, .2]) + assert_series_equal(chg, expected) + + def test_autocorr(self): + # Just run the function + self.ts.autocorr() + + def test_first_last_valid(self): + ts = self.ts.copy() + ts[:5] = np.NaN + + index = ts.first_valid_index() + self.assertEqual(index, ts.index[5]) + + ts[-5:] = np.NaN + index = ts.last_valid_index() + self.assertEqual(index, ts.index[-6]) + + ts[:] = np.nan + self.assert_(ts.last_valid_index() is None) + self.assert_(ts.first_valid_index() is None) + + ser = Series([], index=[]) + self.assert_(ser.last_valid_index() is None) + self.assert_(ser.first_valid_index() is None) + + def test_mpl_compat_hack(self): + result = self.ts[:, np.newaxis] + expected = self.ts.values[:, np.newaxis] + assert_almost_equal(result, expected) + +#------------------------------------------------------------------------------- +# GroupBy + + def test_select(self): + n = len(self.ts) + result = self.ts.select(lambda x: x >= self.ts.index[n // 2]) + expected = self.ts.reindex(self.ts.index[n//2:]) + assert_series_equal(result, expected) + + result = self.ts.select(lambda x: x.weekday() == 2) + expected = self.ts[self.ts.weekday == 2] + assert_series_equal(result, expected) + +#---------------------------------------------------------------------- +# Misc not safe for sparse + + def test_dropna_preserve_name(self): + self.ts[:5] = np.nan + result = self.ts.dropna() + self.assertEquals(result.name, self.ts.name) + + def test_numpy_unique(self): + # it works! 
+ result = np.unique(self.ts) + +class TestSeriesNonUnique(unittest.TestCase): + + def setUp(self): + pass + + def test_basic_indexing(self): + s = Series(np.random.randn(5), index=['a', 'b', 'a', 'a', 'b']) + + self.assertRaises(IndexError, s.__getitem__, 5) + self.assertRaises(IndexError, s.__setitem__, 5, 0) + + self.assertRaises(KeyError, s.__getitem__, 'c') + self.assertRaises(KeyError, s.__setitem__, 'c', 0) + + s = s.sort_index() + + self.assertRaises(IndexError, s.__getitem__, 5) + self.assertRaises(IndexError, s.__setitem__, 5, 0) + + self.assertRaises(KeyError, s.__getitem__, 'c') + self.assertRaises(KeyError, s.__setitem__, 'c', 0) + + def test_int_indexing(self): + s = Series(np.random.randn(6), index=[0, 0, 1, 1, 2, 2]) + + self.assertRaises(KeyError, s.__getitem__, 5) + self.assertRaises(KeyError, s.__setitem__, 5, 0) + + self.assertRaises(KeyError, s.__getitem__, 'c') + self.assertRaises(KeyError, s.__setitem__, 'c', 0) + + # not monotonic + s = Series(np.random.randn(6), index=[2, 2, 0, 0, 1, 1]) + + self.assertRaises(KeyError, s.__getitem__, 5) + self.assertRaises(KeyError, s.__setitem__, 5, 0) + + self.assertRaises(KeyError, s.__getitem__, 'c') + self.assertRaises(KeyError, s.__setitem__, 'c', 0) + + def test_datetime_indexing(self): + from pandas import date_range + + index = date_range('1/1/2000', '1/7/2000') + index = index.repeat(3) + + s = Series(len(index), index=index) + stamp = Timestamp('1/8/2000') + + self.assertRaises(KeyError, s.__getitem__, stamp) + self.assertRaises(KeyError, s.__setitem__, stamp, 0) + + # not monotonic + s = s[::-1] + + self.assertRaises(KeyError, s.__getitem__, stamp) + self.assertRaises(KeyError, s.__setitem__, stamp, 0) + + def test_reset_index(self): + df = tm.makeDataFrame()[:5] + ser = df.stack() + ser.index.names = ['hash', 'category'] + + ser.name = 'value' + df = ser.reset_index() + self.assert_('value' in df) + + df = ser.reset_index(name='value2') + self.assert_('value2' in df) + + def test_timeseries_coercion(self): + idx = tm.makeDateIndex(10000) + ser = Series(np.random.randn(len(idx)), idx.astype(object)) + self.assert_(isinstance(ser, TimeSeries)) + self.assert_(isinstance(ser.index, DatetimeIndex)) + + def test_replace(self): + N = 100 + ser = Series(np.fabs(np.random.randn(N)), tm.makeDateIndex(N), + dtype=object) + ser[:5] = np.nan + ser[6:10] = 'foo' + ser[20:30] = 'bar' + + # replace list with a single value + rs = ser.replace([np.nan, 'foo', 'bar'], -1) + + self.assert_((rs[:5] == -1).all()) + self.assert_((rs[6:10] == -1).all()) + self.assert_((rs[20:30] == -1).all()) + self.assert_((isnull(ser[:5])).all()) + + # replace with different values + rs = ser.replace({np.nan : -1, 'foo' : -2, 'bar' : -3}) + + self.assert_((rs[:5] == -1).all()) + self.assert_((rs[6:10] == -2).all()) + self.assert_((rs[20:30] == -3).all()) + self.assert_((isnull(ser[:5])).all()) + + # replace with different values with 2 lists + rs2 = ser.replace([np.nan, 'foo', 'bar'], [-1, -2, -3]) + assert_series_equal(rs, rs2) + + # replace with forward fill not considering np.nan missing + s2 = ser.copy() + s2[5] = np.nan + rs3 = s2.replace(['foo', 'bar']) + self.assert_(isnull(rs3[6])) + + # replace with back fill considering np.nan as missing + rs4 = ser.replace([np.nan, 'foo', 'bar'], method='bfill') + assert_almost_equal(rs4[4], ser[5]) + + # replace inplace + ser.replace([np.nan, 'foo', 'bar'], -1, inplace=True) + self.assert_((ser[:5] == -1).all()) + self.assert_((ser[6:10] == -1).all()) + self.assert_((ser[20:30] == -1).all()) + def 
test_repeat(self): + s = Series(np.random.randn(3), index=['a', 'b', 'c']) + + reps = s.repeat(5) + exp = Series(s.values.repeat(5), index=s.index.values.repeat(5)) + assert_series_equal(reps, exp) + + to_rep = [2, 3, 4] + reps = s.repeat(to_rep) + exp = Series(s.values.repeat(to_rep), + index=s.index.values.repeat(to_rep)) + assert_series_equal(reps, exp) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_stats.py b/pandas/tests/test_stats.py new file mode 100644 index 00000000..9d6f1228 --- /dev/null +++ b/pandas/tests/test_stats.py @@ -0,0 +1,118 @@ +import nose +import unittest + +from numpy import nan +import numpy as np + +from pandas import Series, DataFrame + +from pandas.util.compat import product +from pandas.util.testing import (assert_frame_equal, + assert_series_equal, + assert_almost_equal) + +class TestRank(unittest.TestCase): + + s = Series([1, 3, 4, 2, nan, 2, 1, 5, nan, 3]) + df = DataFrame({'A': s, 'B': s}) + + results = { + 'average': np.array([1.5, 5.5, 7.0, 3.5, nan, + 3.5, 1.5, 8.0, nan, 5.5]), + 'min': np.array([1, 5, 7, 3, nan, 3, 1, 8, nan, 5]), + 'max': np.array([2, 6, 7, 4, nan, 4, 2, 8, nan, 6]), + 'first': np.array([1, 5, 7, 3, nan, 4, 2, 8, nan, 6]) + } + + def test_rank_tie_methods(self): + s = self.s + + def _check(s, expected, method='average'): + result = s.rank(method=method) + assert_almost_equal(result, expected) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, dtype in product(results, dtypes): + if (dtype, method) in disabled: + continue + series = s if dtype is None else s.astype(dtype) + _check(series, results[method], method=method) + + def test_rank_descending(self): + dtypes = ['O', 'f8', 'i8'] + + for dtype, method in product(dtypes, self.results): + if 'i' in dtype: + s = self.s.dropna() + df = self.df.dropna() + else: + s = self.s.astype(dtype) + df = self.df.astype(dtype) + + res = s.rank(ascending=False) + expected = (s.max() - s).rank() + assert_series_equal(res, expected) + + res = df.rank(ascending=False) + expected = (df.max() - df).rank() + assert_frame_equal(res, expected) + + if method == 'first' and dtype == 'O': + continue + + expected = (s.max() - s).rank(method=method) + res2 = s.rank(method=method, ascending=False) + assert_series_equal(res2, expected) + + expected = (df.max() - df).rank(method=method) + + if dtype != 'O': + res2 = df.rank(method=method, ascending=False, + numeric_only=True) + assert_frame_equal(res2, expected) + + res3 = df.rank(method=method, ascending=False, + numeric_only=False) + assert_frame_equal(res3, expected) + + def test_rank_2d_tie_methods(self): + s = self.s + df = self.df + + def _check2d(df, expected, method='average', axis=0): + exp_df = DataFrame({'A': expected, 'B': expected}) + + if axis == 1: + df = df.T + exp_df = exp_df.T + + result = df.rank(method=method, axis=axis) + assert_frame_equal(result, exp_df) + + dtypes = [None, object] + disabled = set([(object, 'first')]) + results = self.results + + for method, axis, dtype in product(results, [0, 1], dtypes): + if (dtype, method) in disabled: + continue + frame = df if dtype is None else df.astype(dtype) + _check2d(frame, results[method], method=method, axis=axis) + + def test_rank_int(self): + s = self.s.dropna().astype('i8') + + for method, res in self.results.iteritems(): + result = s.rank(method=method) + expected = Series(res).dropna() + expected.index = result.index + 
assert_series_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tests/test_tseries.py b/pandas/tests/test_tseries.py new file mode 100644 index 00000000..9061402b --- /dev/null +++ b/pandas/tests/test_tseries.py @@ -0,0 +1,639 @@ +import unittest + +from numpy import nan +import numpy as np +from pandas import Index, isnull +from pandas.util.testing import assert_almost_equal +import pandas.util.testing as common +import pandas.lib as lib +import pandas._algos as algos +from datetime import datetime + +class TestTseriesUtil(unittest.TestCase): + + def test_combineFunc(self): + pass + + def test_reindex(self): + pass + + def test_isnull(self): + pass + + def test_groupby(self): + pass + + def test_groupby_withnull(self): + pass + + def test_backfill(self): + old = Index([1, 5, 10]) + new = Index(range(12)) + + filler = algos.backfill_int64(old, new) + + expect_filler = [0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, -1] + self.assert_(np.array_equal(filler, expect_filler)) + + # corner case + old = Index([1, 4]) + new = Index(range(5, 10)) + filler = algos.backfill_int64(old, new) + + expect_filler = [-1, -1, -1, -1, -1] + self.assert_(np.array_equal(filler, expect_filler)) + + def test_pad(self): + old = Index([1, 5, 10]) + new = Index(range(12)) + + filler = algos.pad_int64(old, new) + + expect_filler = [-1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2] + self.assert_(np.array_equal(filler, expect_filler)) + + # corner case + old = Index([5, 10]) + new = Index(range(5)) + filler = algos.pad_int64(old, new) + expect_filler = [-1, -1, -1, -1, -1] + self.assert_(np.array_equal(filler, expect_filler)) + +def test_left_join_indexer_unique(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([2, 2, 3, 4, 4], dtype=np.int64) + + result = algos.left_join_indexer_unique_int64(b, a) + expected = np.array([1, 1, 2, 3, 3], dtype=np.int64) + assert(np.array_equal(result, expected)) + +def test_left_outer_join_bug(): + left = np.array([0, 1, 0, 1, 1, 2, 3, 1, 0, 2, 1, 2, 0, 1, 1, 2, 3, 2, 3, + 2, 1, 1, 3, 0, 3, 2, 3, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 1, + 3, 0, 0, 1, 0, 3, 1, 0, 1, 0, 1, 1, 0, 2, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 0, 3, 1, 3, 2, 2, 0, 1, 3, 0, 2, 3, 2, 3, 3, + 2, 3, 3, 1, 3, 2, 0, 0, 3, 1, 1, 1, 0, 2, 3, 3, 1, 2, 0, + 3, 1, 2, 0, 2], dtype=np.int64) + + right = np.array([3, 1], dtype=np.int64) + max_groups = 4 + + lidx, ridx = lib.left_outer_join(left, right, max_groups, sort=False) + + exp_lidx = np.arange(len(left)) + exp_ridx = -np.ones(len(left)) + exp_ridx[left == 1] = 1 + exp_ridx[left == 3] = 0 + + assert(np.array_equal(lidx, exp_lidx)) + assert(np.array_equal(ridx, exp_ridx)) + +def test_inner_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.inner_join_indexer_int64(a, b) + + index_exp = np.array([3, 5], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([2, 4]) + bexp = np.array([1, 2]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.inner_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + assert_almost_equal(bres, [0]) + +def test_outer_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.outer_join_indexer_int64(a, b) + 
+ index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64) + assert_almost_equal(index, index_exp) + + aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.int64) + bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4]) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.outer_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + assert_almost_equal(bres, [0]) + +def test_left_join_indexer(): + a = np.array([1, 2, 3, 4, 5], dtype=np.int64) + b = np.array([0, 3, 5, 7, 9], dtype=np.int64) + + index, ares, bres = algos.left_join_indexer_int64(a, b) + + assert_almost_equal(index, a) + + aexp = np.array([0, 1, 2, 3, 4], dtype=np.int64) + bexp = np.array([-1, -1, 1, -1, 2], dtype=np.int64) + assert_almost_equal(ares, aexp) + assert_almost_equal(bres, bexp) + + a = np.array([5], dtype=np.int64) + b = np.array([5], dtype=np.int64) + + index, ares, bres = algos.left_join_indexer_int64(a, b) + assert_almost_equal(index, [5]) + assert_almost_equal(ares, [0]) + assert_almost_equal(bres, [0]) + +def test_left_join_indexer2(): + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + + res, lidx, ridx = algos.left_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + +def test_outer_join_indexer2(): + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + + res, lidx, ridx = algos.outer_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + +def test_inner_join_indexer2(): + idx = Index([1,1,2,5]) + idx2 = Index([1,2,5,7,9]) + + res, lidx, ridx = algos.inner_join_indexer_int64(idx2, idx) + + exp_res = np.array([1, 1, 2, 5], dtype=np.int64) + assert_almost_equal(res, exp_res) + + exp_lidx = np.array([0, 0, 1, 2], dtype=np.int64) + assert_almost_equal(lidx, exp_lidx) + + exp_ridx = np.array([0, 1, 2, 3], dtype=np.int64) + assert_almost_equal(ridx, exp_ridx) + + +def test_is_lexsorted(): + failure = [ + np.array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, + 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0]), + np.array([30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, + 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, + 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, + 7, 6, 5, 4, 3, 2, 1, 0, 30, 29, 28, 27, 26, 25, 24, 23, 22, + 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, + 4, 3, 2, 1, 0])] + + assert(not lib.is_lexsorted(failure)) + +# def test_get_group_index(): +# a = np.array([0, 1, 2, 0, 2, 1, 0, 0], dtype=np.int64) +# b = np.array([1, 0, 3, 
2, 0, 2, 3, 0], dtype=np.int64) +# expected = np.array([1, 4, 11, 2, 8, 6, 3, 0], dtype=np.int64) + +# result = lib.get_group_index([a, b], (3, 4)) + +# assert(np.array_equal(result, expected)) + +def test_groupsort_indexer(): + a = np.random.randint(0, 1000, 100).astype(np.int64) + b = np.random.randint(0, 1000, 100).astype(np.int64) + + result = lib.groupsort_indexer(a, 1000)[0] + + # need to use a stable sort + expected = np.argsort(a, kind='mergesort') + assert(np.array_equal(result, expected)) + + # compare with lexsort + key = a * 1000 + b + result = lib.groupsort_indexer(key, 1000000)[0] + expected = np.lexsort((b, a)) + assert(np.array_equal(result, expected)) + +def test_ensure_platform_int(): + arr = np.arange(100) + + result = algos.ensure_platform_int(arr) + assert(result is arr) + +def test_duplicated_with_nas(): + keys = np.array([0, 1, nan, 0, 2, nan], dtype=object) + + result = lib.duplicated(keys) + expected = [False, False, False, True, False, True] + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, take_last=True) + expected = [True, False, True, False, False, False] + assert(np.array_equal(result, expected)) + + keys = np.empty(8, dtype=object) + for i, t in enumerate(zip([0, 0, nan, nan]*2, [0, nan, 0, nan]*2)): + keys[i] = t + + result = lib.duplicated(keys) + falses = [False] * 4 + trues = [True] * 4 + expected = falses + trues + assert(np.array_equal(result, expected)) + + result = lib.duplicated(keys, take_last=True) + expected = trues + falses + assert(np.array_equal(result, expected)) + +def test_maybe_booleans_to_slice(): + arr = np.array([0, 0, 1, 1, 1, 0, 1], dtype=np.uint8) + result = lib.maybe_booleans_to_slice(arr) + assert(result.dtype == np.bool_) + + result = lib.maybe_booleans_to_slice(arr[:0]) + assert(result == slice(0, 0)) + +def test_convert_objects(): + arr = np.array(['a', 'b', nan, nan, 'd', 'e', 'f'], dtype='O') + result = lib.maybe_convert_objects(arr) + assert(result.dtype == np.object_) + +def test_convert_objects_ints(): + # test that we can detect many kinds of integers + dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + + for dtype_str in dtypes: + arr = np.array(list(np.arange(20, dtype=dtype_str)), dtype='O') + assert(arr[0].dtype == np.dtype(dtype_str)) + result = lib.maybe_convert_objects(arr) + assert(issubclass(result.dtype.type, np.integer)) + +def test_convert_objects_complex_number(): + for dtype in np.sctypes['complex']: + arr = np.array(list(1j * np.arange(20, dtype=dtype)), dtype='O') + assert(arr[0].dtype == np.dtype(dtype)) + result = lib.maybe_convert_objects(arr) + assert(issubclass(result.dtype.type, np.complexfloating)) + +def test_rank(): + from pandas.compat.scipy import rankdata + + def _check(arr): + mask = -np.isfinite(arr) + arr = arr.copy() + result = lib.rank_1d_float64(arr) + arr[mask] = np.inf + exp = rankdata(arr) + exp[mask] = nan + assert_almost_equal(result, exp) + + _check(np.array([nan, nan, 5., 5., 5., nan, 1, 2, 3, nan])) + _check(np.array([4., nan, 5., 5., 5., nan, 1, 2, 4., nan])) + +def test_get_reverse_indexer(): + indexer = np.array([-1, -1, 1, 2, 0, -1, 3, 4], dtype=np.int64) + result = lib.get_reverse_indexer(indexer, 5) + expected = np.array([4, 2, 3, 6, 7], dtype=np.int64) + assert(np.array_equal(result, expected)) + +def test_pad_backfill_object_segfault(): + from datetime import datetime + old = np.array([], dtype='O') + new = np.array([datetime(2010, 12, 31)], dtype='O') + + result = algos.pad_object(old, new) + expected = np.array([-1], dtype=np.int64) + 
assert(np.array_equal(result, expected)) + + result = algos.pad_object(new, old) + expected = np.array([], dtype=np.int64) + assert(np.array_equal(result, expected)) + + result = algos.backfill_object(old, new) + expected = np.array([-1], dtype=np.int64) + assert(np.array_equal(result, expected)) + + result = algos.backfill_object(new, old) + expected = np.array([], dtype=np.int64) + assert(np.array_equal(result, expected)) + +def test_arrmap(): + values = np.array(['foo', 'foo', 'bar', 'bar', 'baz', 'qux'], dtype='O') + result = algos.arrmap_object(values, lambda x: x in ['foo', 'bar']) + assert(result.dtype == np.bool_) + +def test_series_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + labels = np.array([-1, -1, -1, 0, 0, 0, 1, 1, 1, 1], dtype=np.int64) + + grouper = lib.SeriesGrouper(obj, np.mean, labels, 2, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + +def test_series_bin_grouper(): + from pandas import Series + obj = Series(np.random.randn(10)) + dummy = obj[:0] + + bins = np.array([3, 6]) + + grouper = lib.SeriesBinGrouper(obj, np.mean, bins, dummy) + result, counts = grouper.get_result() + + expected = np.array([obj[:3].mean(), obj[3:6].mean(), obj[6:].mean()]) + assert_almost_equal(result, expected) + + exp_counts = np.array([3, 3, 4], dtype=np.int64) + assert_almost_equal(counts, exp_counts) + +class TestBinGroupers(unittest.TestCase): + + def setUp(self): + self.obj = np.random.randn(10, 1) + self.labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2], dtype=np.int64) + self.bins = np.array([3, 6], dtype=np.int64) + + def test_generate_bins(self): + from pandas.core.groupby import generate_bins_generic + values = np.array([1,2,3,4,5,6], dtype=np.int64) + binner = np.array([0,3,6,9], dtype=np.int64) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + bins = func(values, binner, closed='left') + assert((bins == np.array([2, 5, 6])).all()) + + bins = func(values, binner, closed='right') + assert((bins == np.array([3, 6, 6])).all()) + + for func in [lib.generate_bins_dt64, generate_bins_generic]: + values = np.array([1,2,3,4,5,6], dtype=np.int64) + binner = np.array([0,3,6], dtype=np.int64) + + bins = func(values, binner, closed='right') + assert((bins == np.array([3, 6])).all()) + + self.assertRaises(ValueError, generate_bins_generic, values, [], + 'right') + self.assertRaises(ValueError, generate_bins_generic, values[:0], + binner, 'right') + + self.assertRaises(ValueError, generate_bins_generic, + values, [4], 'right') + self.assertRaises(ValueError, generate_bins_generic, + values, [-3, -1], 'right') + + def test_group_bin_functions(self): + funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + + np_funcs = { + 'add': np.sum, + 'mean': np.mean, + 'prod': np.prod, + 'min': np.min, + 'max': np.max, + 'var': lambda x: x.var(ddof=1) if len(x) >=2 else np.nan + } + + for fname in funcs: + args = [getattr(lib, 'group_%s' % fname), + getattr(lib, 'group_%s_bin' % fname), + np_funcs[fname]] + self._check_versions(*args) + + def _check_versions(self, irr_func, bin_func, np_func): + obj = self.obj + + cts = np.zeros(3, dtype=np.int64) + exp = np.zeros((3, 1), np.float64) + irr_func(exp, cts, obj, self.labels) + + # bin-based version + bins = np.array([3, 6], dtype=np.int64) + out = np.zeros((3, 1), np.float64) + counts = np.zeros(len(out), 
dtype=np.int64) + bin_func(out, counts, obj, bins) + + assert_almost_equal(out, exp) + + bins = np.array([3, 9, 10], dtype=np.int64) + out = np.zeros((3, 1), np.float64) + counts = np.zeros(len(out), dtype=np.int64) + bin_func(out, counts, obj, bins) + exp = np.array([np_func(obj[:3]), np_func(obj[3:9]), + np_func(obj[9:])], + dtype=np.float64) + assert_almost_equal(out.squeeze(), exp) + + # duplicate bins + bins = np.array([3, 6, 10, 10], dtype=np.int64) + out = np.zeros((4, 1), np.float64) + counts = np.zeros(len(out), dtype=np.int64) + bin_func(out, counts, obj, bins) + exp = np.array([np_func(obj[:3]), np_func(obj[3:6]), + np_func(obj[6:10]), np.nan], + dtype=np.float64) + assert_almost_equal(out.squeeze(), exp) + + +def test_group_ohlc(): + obj = np.random.randn(20) + + bins = np.array([6, 12], dtype=np.int64) + out = np.zeros((3, 4), np.float64) + counts = np.zeros(len(out), dtype=np.int64) + + lib.group_ohlc(out, counts, obj[:, None], bins) + + def _ohlc(group): + if isnull(group).all(): + return np.repeat(nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + expected = np.array([_ohlc(obj[:6]), _ohlc(obj[6:12]), + _ohlc(obj[12:])]) + + assert_almost_equal(out, expected) + assert_almost_equal(counts, [6, 6, 8]) + + obj[:6] = nan + lib.group_ohlc(out, counts, obj[:, None], bins) + expected[0] = nan + assert_almost_equal(out, expected) + +def test_try_parse_dates(): + from dateutil.parser import parse + + arr = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object) + + result = lib.try_parse_dates(arr, dayfirst=True) + expected = [parse(d, dayfirst=True) for d in arr] + assert(np.array_equal(result, expected)) + + +class TestTypeInference(unittest.TestCase): + + def test_length_zero(self): + result = lib.infer_dtype(np.array([], dtype='i4')) + self.assertEqual(result, 'empty') + + result = lib.infer_dtype(np.array([], dtype='O')) + self.assertEqual(result, 'empty') + + def test_integers(self): + arr = np.array([1, 2, 3, np.int64(4), np.int32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + arr = np.array([1, 2, 3, np.int64(4), np.int32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='i4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'integer') + + def test_bools(self): + arr = np.array([True, False, True, True, True], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([np.bool_(True), np.bool_(False)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + arr = np.array([True, False, True, 'foo'], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed') + + arr = np.array([True, False, True], dtype=bool) + result = lib.infer_dtype(arr) + self.assertEqual(result, 'boolean') + + def test_floats(self): + arr = np.array([1., 2., 3., np.float64(4), np.float32(5)], dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, np.float64(4), np.float32(5), 'foo'], + dtype='O') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'mixed-integer') + + arr = np.array([1, 2, 3, 4, 5], dtype='f4') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + arr = np.array([1, 2, 3, 4, 5], dtype='f8') + result = lib.infer_dtype(arr) + self.assertEqual(result, 'floating') + + def test_string(self): + pass + + def test_unicode(self): + pass + + def 
test_datetime(self): + import datetime + dates = [datetime.datetime(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assert_(index.inferred_type == 'datetime64') + + def test_date(self): + import datetime + dates = [datetime.date(2012, 1, x) for x in range(1, 20)] + index = Index(dates) + self.assert_(index.inferred_type == 'date') + + def test_to_object_array_tuples(self): + r = (5,6) + values = [r] + result = lib.to_object_array_tuples(values) + + try: + # make sure record array works + from collections import namedtuple + record = namedtuple('record', 'x y') + r = record(5, 6) + values = [r] + result = lib.to_object_array_tuples(values) + except ImportError: + pass + + +class TestMoments(unittest.TestCase): + pass + + +class TestReducer(unittest.TestCase): + + def test_int_index(self): + from pandas.core.series import Series + + arr = np.random.randn(100, 4) + + result = lib.reduce(arr, np.sum, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + result = lib.reduce(arr, np.sum, axis=1, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(100)) + result = lib.reduce(arr, np.sum, dummy=dummy, labels=Index(np.arange(4))) + expected = arr.sum(0) + assert_almost_equal(result, expected) + + dummy = Series(0., index=np.arange(4)) + result = lib.reduce(arr, np.sum, axis=1, + dummy=dummy, labels=Index(np.arange(100))) + expected = arr.sum(1) + assert_almost_equal(result, expected) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tools/__init__.py b/pandas/tools/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tools/describe.py b/pandas/tools/describe.py new file mode 100644 index 00000000..43e3051d --- /dev/null +++ b/pandas/tools/describe.py @@ -0,0 +1,16 @@ +from pandas.core.series import Series + +def value_range(df): + """ + Return the minimum and maximum of a dataframe in a series object + + Parameters + ---------- + df : DataFrame + + Returns + ------- + (maximum, minimum) : Series + + """ + return Series((min(df.min()), max(df.max())), ('Minimum', 'Maximum')) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py new file mode 100644 index 00000000..9fdbbdaa --- /dev/null +++ b/pandas/tools/merge.py @@ -0,0 +1,1212 @@ +""" +SQL-style merge routines +""" + +import numpy as np + +from pandas.core.categorical import Factor +from pandas.core.frame import DataFrame, _merge_doc +from pandas.core.generic import NDFrame +from pandas.core.groupby import get_group_index +from pandas.core.series import Series +from pandas.core.index import (Index, MultiIndex, _get_combined_index, + _ensure_index, _get_consensus_names, + _all_indexes_same) +from pandas.core.internals import (IntBlock, BoolBlock, BlockManager, + DatetimeBlock, make_block, _consolidate) +from pandas.util.decorators import cache_readonly, Appender, Substitution + +from pandas.sparse.frame import SparseDataFrame +import pandas.core.common as com + +import pandas.lib as lib + +@Substitution('\nleft : DataFrame') +@Appender(_merge_doc, indents=0) +def merge(left, right, how='inner', on=None, left_on=None, right_on=None, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True): + op = _MergeOperation(left, right, how=how, on=on, left_on=left_on, + right_on=right_on, left_index=left_index, + right_index=right_index, sort=sort, 
suffixes=suffixes, + copy=copy) + return op.get_result() +if __debug__: merge.__doc__ = _merge_doc % '\nleft : DataFrame' + + +class MergeError(Exception): + pass + + +def ordered_merge(left, right, on=None, left_by=None, right_by=None, + left_on=None, right_on=None, + fill_method=None, suffixes=('_x', '_y')): + """Perform merge with optional filling/interpolation designed for ordered + data like time series data. Optionally perform group-wise merge (see + examples) + + Parameters + ---------- + left : DataFrame + right : DataFrame + fill_method : {'ffill', None}, default None + Interpolation method for data + on : label or list + Field names to join on. Must be found in both DataFrames. + left_on : label or list, or array-like + Field names to join on in left DataFrame. Can be a vector or list of + vectors of the length of the DataFrame to use a particular vector as + the join key instead of columns + right_on : label or list, or array-like + Field names to join on in right DataFrame or vector/list of vectors per + left_on docs + left_by : column name or list of column names + Group left DataFrame by group columns and merge piece by piece with + right DataFrame + right_by : column name or list of column names + Group right DataFrame by group columns and merge piece by piece with + left DataFrame + suffixes : 2-length sequence (tuple, list, ...) + Suffix to apply to overlapping column names in the left and right + side, respectively + + Examples + -------- + >>> A >>> B + key lvalue group key rvalue + 0 a 1 a 0 b 1 + 1 c 2 a 1 c 2 + 2 e 3 a 2 d 3 + 3 a 1 b + 4 c 2 b + 5 e 3 b + + >>> ordered_merge(A, B, fill_method='ffill', left_by='group') + key lvalue group rvalue + 0 a 1 a NaN + 1 b 1 a 1 + 2 c 2 a 2 + 3 d 2 a 3 + 4 e 3 a 3 + 5 f 3 a 4 + 6 a 1 b NaN + 7 b 1 b 1 + 8 c 2 b 2 + 9 d 2 b 3 + 10 e 3 b 3 + 11 f 3 b 4 + + Returns + ------- + merged : DataFrame + """ + def _merger(x, y): + op = _OrderedMerge(x, y, on=on, left_on=left_on, right_on=right_on, + # left_index=left_index, right_index=right_index, + suffixes=suffixes, fill_method=fill_method) + return op.get_result() + + if left_by is not None and right_by is not None: + raise ValueError('Can only group either left or right frames') + elif left_by is not None: + if not isinstance(left_by, (list, tuple)): + left_by = [left_by] + pieces = [] + for key, xpiece in left.groupby(left_by): + merged = _merger(xpiece, right) + for k in left_by: + # May have passed ndarray + try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + elif right_by is not None: + if not isinstance(right_by, (list, tuple)): + right_by = [right_by] + pieces = [] + for key, ypiece in right.groupby(right_by): + merged = _merger(left, ypiece) + for k in right_by: + try: + if k in merged: + merged[k] = key + except: + pass + pieces.append(merged) + return concat(pieces, ignore_index=True) + else: + return _merger(left, right) + + + +# TODO: NA group handling +# TODO: transformations?? 
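+# A minimal usage sketch of the group-wise ordered merge documented above,
+# mirroring the frames from the docstring example (illustrative only):
+#
+#   >>> A = DataFrame({'key': ['a', 'c', 'e', 'a', 'c', 'e'],
+#   ...                'lvalue': [1, 2, 3, 1, 2, 3],
+#   ...                'group': ['a', 'a', 'a', 'b', 'b', 'b']})
+#   >>> B = DataFrame({'key': ['b', 'c', 'd'], 'rvalue': [1, 2, 3]})
+#   >>> ordered_merge(A, B, fill_method='ffill', left_by='group')
+#
+# Each 'group' value in A is merged against B separately and the pieces are
+# concatenated with ignore_index=True, so the result gets a fresh 0..n-1 index.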
+# TODO: only copy DataFrames when modification necessary + +class _MergeOperation(object): + """ + Perform a database (SQL) merge operation between two DataFrame objects + using either columns as keys or their row indexes + """ + + def __init__(self, left, right, how='inner', on=None, + left_on=None, right_on=None, axis=1, + left_index=False, right_index=False, sort=True, + suffixes=('_x', '_y'), copy=True): + self.left = self.orig_left = left + self.right = self.orig_right = right + self.how = how + self.axis = axis + + self.on = com._maybe_make_list(on) + self.left_on = com._maybe_make_list(left_on) + self.right_on = com._maybe_make_list(right_on) + + self.copy = copy + self.suffixes = suffixes + self.sort = sort + + self.left_index = left_index + self.right_index = right_index + + # note this function has side effects + (self.left_join_keys, + self.right_join_keys, + self.join_names) = self._get_merge_keys() + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self._get_merge_data() + + # TODO: more efficiently handle group keys to avoid extra consolidation! + join_op = _BlockJoinOperation([ldata, rdata], join_index, + [left_indexer, right_indexer], axis=1, + copy=self.copy) + + result_data = join_op.get_result() + result = DataFrame(result_data) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + def _maybe_add_join_keys(self, result, left_indexer, right_indexer): + # insert group keys + + keys = zip(self.join_names, self.left_on, self.right_on) + for i, (name, lname, rname) in enumerate(keys): + if not _should_fill(lname, rname): + continue + + if name in result: + key_col = result[name] + + if name in self.left and left_indexer is not None: + na_indexer = (left_indexer == -1).nonzero()[0] + if len(na_indexer) == 0: + continue + + right_na_indexer = right_indexer.take(na_indexer) + key_col.put(na_indexer, com.take_1d(self.right_join_keys[i], + right_na_indexer)) + elif name in self.right and right_indexer is not None: + na_indexer = (right_indexer == -1).nonzero()[0] + if len(na_indexer) == 0: + continue + + left_na_indexer = left_indexer.take(na_indexer) + key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], + left_na_indexer)) + elif left_indexer is not None: + if name is None: + name = 'key_%d' % i + + # a faster way? 
+ key_col = com.take_1d(self.left_join_keys[i], left_indexer) + na_indexer = (left_indexer == -1).nonzero()[0] + right_na_indexer = right_indexer.take(na_indexer) + key_col.put(na_indexer, com.take_1d(self.right_join_keys[i], + right_na_indexer)) + result.insert(i, name, key_col) + + def _get_join_info(self): + left_ax = self.left._data.axes[self.axis] + right_ax = self.right._data.axes[self.axis] + if self.left_index and self.right_index: + join_index, left_indexer, right_indexer = \ + left_ax.join(right_ax, how=self.how, return_indexers=True) + elif self.right_index and self.how == 'left': + join_index, left_indexer, right_indexer = \ + _left_join_on_index(left_ax, right_ax, self.left_join_keys, + sort=self.sort) + + elif self.left_index and self.how == 'right': + join_index, right_indexer, left_indexer = \ + _left_join_on_index(right_ax, left_ax, self.right_join_keys, + sort=self.sort) + else: + (left_indexer, + right_indexer) = _get_join_indexers(self.left_join_keys, + self.right_join_keys, + sort=self.sort, how=self.how) + + if self.right_index: + join_index = self.left.index.take(left_indexer) + elif self.left_index: + join_index = self.right.index.take(right_indexer) + else: + join_index = Index(np.arange(len(left_indexer))) + + return join_index, left_indexer, right_indexer + + def _get_merge_data(self): + """ + Handles overlapping column names etc. + """ + ldata, rdata = self.left._data, self.right._data + lsuf, rsuf = self.suffixes + ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf, + copydata=False) + return ldata, rdata + + def _get_merge_keys(self): + """ + Note: has side effects (copy/delete key columns) + + Parameters + ---------- + left + right + on + + Returns + ------- + left_keys, right_keys + """ + self._validate_specification() + + left_keys = [] + right_keys = [] + join_names = [] + right_drop = [] + left, right = self.left, self.right + + is_lkey = lambda x: isinstance(x, np.ndarray) and len(x) == len(left) + is_rkey = lambda x: isinstance(x, np.ndarray) and len(x) == len(right) + + # ugh, spaghetti re #733 + if _any(self.left_on) and _any(self.right_on): + for lk, rk in zip(self.left_on, self.right_on): + if is_lkey(lk): + left_keys.append(lk) + if is_rkey(rk): + right_keys.append(rk) + join_names.append(None) # what to do? + else: + right_keys.append(right[rk].values) + join_names.append(rk) + else: + if not is_rkey(rk): + right_keys.append(right[rk].values) + if lk == rk: + right_drop.append(rk) + else: + right_keys.append(rk) + left_keys.append(left[lk].values) + join_names.append(lk) + elif _any(self.left_on): + for k in self.left_on: + if is_lkey(k): + left_keys.append(k) + join_names.append(None) + else: + left_keys.append(left[k].values) + join_names.append(k) + if isinstance(self.right.index, MultiIndex): + right_keys = [lev.values.take(lab) + for lev, lab in zip(self.right.index.levels, + self.right.index.labels)] + else: + right_keys = [self.right.index.values] + elif _any(self.right_on): + for k in self.right_on: + if is_rkey(k): + right_keys.append(k) + join_names.append(None) + else: + right_keys.append(right[k].values) + join_names.append(k) + if isinstance(self.left.index, MultiIndex): + left_keys = [lev.values.take(lab) + for lev, lab in zip(self.left.index.levels, + self.left.index.labels)] + else: + left_keys = [self.left.index.values] + + if right_drop: + self.right = self.right.drop(right_drop, axis=1) + + return left_keys, right_keys, join_names + + def _validate_specification(self): + # Hm, any way to make this logic less complicated?? 
+ if (self.on is None and self.left_on is None + and self.right_on is None): + + if self.left_index and self.right_index: + self.left_on, self.right_on = (), () + elif self.left_index: + if self.right_on is None: + raise MergeError('Must pass right_on or right_index=True') + elif self.right_index: + if self.left_on is None: + raise MergeError('Must pass left_on or left_index=True') + else: + # use the common columns + common_cols = self.left.columns.intersection(self.right.columns) + if len(common_cols) == 0: + raise MergeError('No common columns to perform merge on') + self.left_on = self.right_on = common_cols + elif self.on is not None: + if self.left_on is not None or self.right_on is not None: + raise MergeError('Can only pass on OR left_on and ' + 'right_on') + self.left_on = self.right_on = self.on + elif self.left_on is not None: + n = len(self.left_on) + if self.right_index: + assert(len(self.left_on) == self.right.index.nlevels) + self.right_on = [None] * n + elif self.right_on is not None: + n = len(self.right_on) + if self.left_index: + assert(len(self.right_on) == self.left.index.nlevels) + self.left_on = [None] * n + assert(len(self.right_on) == len(self.left_on)) + + +def _get_join_indexers(left_keys, right_keys, sort=False, how='inner'): + """ + + Parameters + ---------- + + Returns + ------- + + """ + assert(len(left_keys) == len(right_keys)) + + left_labels = [] + right_labels = [] + group_sizes = [] + + for lk, rk in zip(left_keys, right_keys): + llab, rlab, count = _factorize_keys(lk, rk, sort=sort) + + left_labels.append(llab) + right_labels.append(rlab) + group_sizes.append(count) + + left_group_key = get_group_index(left_labels, group_sizes) + right_group_key = get_group_index(right_labels, group_sizes) + + max_groups = 1L + for x in group_sizes: + max_groups *= long(x) + + if max_groups > 2**63: # pragma: no cover + raise MergeError('Combinatorial explosion! 
(boom)') + + left_group_key, right_group_key, max_groups = \ + _factorize_keys(left_group_key, right_group_key, sort=sort) + + join_func = _join_functions[how] + return join_func(left_group_key, right_group_key, max_groups) + + + +class _OrderedMerge(_MergeOperation): + + def __init__(self, left, right, on=None, by=None, left_on=None, + right_on=None, axis=1, left_index=False, right_index=False, + suffixes=('_x', '_y'), copy=True, + fill_method=None): + + self.fill_method = fill_method + + _MergeOperation.__init__(self, left, right, on=on, left_on=left_on, + right_on=right_on, axis=axis, + left_index=left_index, + right_index=right_index, + how='outer', suffixes=suffixes, + sort=True # sorts when factorizing + ) + + + def get_result(self): + join_index, left_indexer, right_indexer = self._get_join_info() + + # this is a bit kludgy + ldata, rdata = self._get_merge_data() + + if self.fill_method == 'ffill': + left_join_indexer = lib.ffill_indexer(left_indexer) + right_join_indexer = lib.ffill_indexer(right_indexer) + else: + left_join_indexer = left_indexer + right_join_indexer = right_indexer + + join_op = _BlockJoinOperation([ldata, rdata], join_index, + [left_join_indexer, right_join_indexer], + axis=1, copy=self.copy) + + result_data = join_op.get_result() + result = DataFrame(result_data) + + self._maybe_add_join_keys(result, left_indexer, right_indexer) + + return result + + +def _get_multiindex_indexer(join_keys, index, sort=False): + shape = [] + labels = [] + for level, key in zip(index.levels, join_keys): + llab, rlab, count = _factorize_keys(level, key, sort=False) + labels.append(rlab) + shape.append(count) + + left_group_key = get_group_index(labels, shape) + right_group_key = get_group_index(index.labels, shape) + + left_group_key, right_group_key, max_groups = \ + _factorize_keys(left_group_key, right_group_key, + sort=False) + + left_indexer, right_indexer = \ + lib.left_outer_join(com._ensure_int64(left_group_key), + com._ensure_int64(right_group_key), + max_groups, sort=False) + + return left_indexer, right_indexer + +def _get_single_indexer(join_key, index, sort=False): + left_key, right_key, count = _factorize_keys(join_key, index, sort=sort) + + left_indexer, right_indexer = \ + lib.left_outer_join(com._ensure_int64(left_key), + com._ensure_int64(right_key), + count, sort=sort) + + return left_indexer, right_indexer + +def _left_join_on_index(left_ax, right_ax, join_keys, sort=False): + join_index = left_ax + left_indexer = None + + if len(join_keys) > 1: + assert(isinstance(right_ax, MultiIndex) and + len(join_keys) == right_ax.nlevels) + + left_tmp, right_indexer = \ + _get_multiindex_indexer(join_keys, right_ax, + sort=sort) + if sort: + left_indexer = left_tmp + join_index = left_ax.take(left_indexer) + else: + jkey = join_keys[0] + if sort: + left_indexer, right_indexer = \ + _get_single_indexer(jkey, right_ax, sort=sort) + join_index = left_ax.take(left_indexer) + else: + right_indexer = right_ax.get_indexer(jkey) + + return join_index, left_indexer, right_indexer + + +def _right_outer_join(x, y, max_groups): + right_indexer, left_indexer = lib.left_outer_join(y, x, max_groups) + return left_indexer, right_indexer + +_join_functions = { + 'inner' : lib.inner_join, + 'left' : lib.left_outer_join, + 'right' : _right_outer_join, + 'outer' : lib.full_outer_join, +} + + +def _factorize_keys(lk, rk, sort=True): + if com.is_integer_dtype(lk) and com.is_integer_dtype(rk): + klass = lib.Int64Factorizer + lk = com._ensure_int64(lk) + rk = com._ensure_int64(rk) + else: + 
klass = lib.Factorizer + lk = com._ensure_object(lk) + rk = com._ensure_object(rk) + + rizer = klass(max(len(lk), len(rk))) + + llab, _ = rizer.factorize(lk) + rlab, _ = rizer.factorize(rk) + + count = rizer.get_count() + + if sort: + llab, rlab = _sort_labels(rizer.uniques, llab, rlab) + + # TODO: na handling + + return llab, rlab, count + +def _sort_labels(uniques, left, right): + if not isinstance(uniques, np.ndarray): + # tuplesafe + uniques = Index(uniques).values + + sorter = uniques.argsort() + + reverse_indexer = np.empty(len(sorter), dtype=np.int64) + reverse_indexer.put(sorter, np.arange(len(sorter))) + + new_left = reverse_indexer.take(com._ensure_platform_int(left)) + np.putmask(new_left, left == -1, -1) + + new_right = reverse_indexer.take(com._ensure_platform_int(right)) + np.putmask(new_right, right == -1, -1) + + return new_left, new_right + +class _BlockJoinOperation(object): + """ + BlockJoinOperation made generic for N DataFrames + + Object responsible for orchestrating efficient join operation between two + BlockManager data structures + """ + def __init__(self, data_list, join_index, indexers, axis=1, copy=True): + if axis <= 0: # pragma: no cover + raise MergeError('Only axis >= 1 supported for this operation') + + assert(len(data_list) == len(indexers)) + + self.units = [] + for data, indexer in zip(data_list, indexers): + if not data.is_consolidated(): + data = data.consolidate() + self.units.append(_JoinUnit(data.blocks, indexer)) + + self.join_index = join_index + self.axis = axis + self.copy = copy + + # do NOT sort + self.result_items = _concat_indexes([d.items for d in data_list]) + self.result_axes = list(data_list[0].axes) + self.result_axes[0] = self.result_items + self.result_axes[axis] = self.join_index + + def _prepare_blocks(self): + blockmaps = [] + + for unit in self.units: + join_blocks = unit.get_upcasted_blocks() + type_map = dict((type(blk), blk) for blk in join_blocks) + blockmaps.append(type_map) + + return blockmaps + + def get_result(self): + """ + Returns + ------- + merged : BlockManager + """ + blockmaps = self._prepare_blocks() + kinds = _get_all_block_kinds(blockmaps) + + result_blocks = [] + + # maybe want to enable flexible copying <-- what did I mean? + for klass in kinds: + klass_blocks = [mapping.get(klass) for mapping in blockmaps] + res_blk = self._get_merged_block(klass_blocks) + result_blocks.append(res_blk) + + return BlockManager(result_blocks, self.result_axes) + + def _get_merged_block(self, blocks): + + to_merge = [] + + for unit, block in zip(self.units, blocks): + if block is not None: + to_merge.append((unit, block)) + + if len(to_merge) > 1: + return self._merge_blocks(to_merge) + else: + unit, block = to_merge[0] + return unit.reindex_block(block, self.axis, + self.result_items, copy=self.copy) + + def _merge_blocks(self, merge_chunks): + """ + merge_chunks -> [(_JoinUnit, Block)] + """ + funit, fblock = merge_chunks[0] + fidx = funit.indexer + + out_shape = list(fblock.values.shape) + + n = len(fidx) if fidx is not None else out_shape[self.axis] + + out_shape[0] = sum(len(blk) for unit, blk in merge_chunks) + out_shape[self.axis] = n + + # Should use Fortran order?? + out = np.empty(out_shape, dtype=fblock.values.dtype) + + sofar = 0 + for unit, blk in merge_chunks: + out_chunk = out[sofar : sofar + len(blk)] + + if unit.indexer is None: + # is this really faster than assigning to arr.flat? 
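+ # no indexer for this unit means an identity take: the block values are
+ # copied straight into the output chunk without any reindexing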
+ com.take_fast(blk.values, np.arange(n, dtype=np.int64), + None, False, + axis=self.axis, out=out_chunk) + else: + # write out the values to the result array + com.take_fast(blk.values, unit.indexer, + None, False, + axis=self.axis, out=out_chunk) + + sofar += len(blk) + + # does not sort + new_block_items = _concat_indexes([b.items for _, b in merge_chunks]) + return make_block(out, new_block_items, self.result_items) + + + +class _JoinUnit(object): + """ + Blocks plus indexer + """ + + def __init__(self, blocks, indexer): + self.blocks = blocks + self.indexer = indexer + + @cache_readonly + def mask_info(self): + if self.indexer is None or not _may_need_upcasting(self.blocks): + mask = None + need_masking = False + else: + mask = self.indexer == -1 + need_masking = mask.any() + + return mask, need_masking + + @property + def need_masking(self): + return self.mask_info[1] + + def get_upcasted_blocks(self): + # will short-circuit and not compute lneed_masking if indexer is None + if self.need_masking: + return _upcast_blocks(self.blocks) + return self.blocks + + def reindex_block(self, block, axis, ref_items, copy=True): + # still some inefficiency here for bool/int64 because in the case where + # no masking is needed, take_fast will recompute the mask + + mask, need_masking = self.mask_info + + if self.indexer is None: + if copy: + result = block.copy() + else: + result = block + else: + result = block.reindex_axis(self.indexer, mask, need_masking, + axis=axis) + + result.ref_items = ref_items + return result + +def _may_need_upcasting(blocks): + for block in blocks: + if isinstance(block, (IntBlock, BoolBlock)): + return True + return False + + +def _upcast_blocks(blocks): + """ + Upcast and consolidate if necessary + """ + new_blocks = [] + for block in blocks: + if isinstance(block, IntBlock): + newb = make_block(block.values.astype(float), block.items, + block.ref_items) + elif isinstance(block, BoolBlock): + newb = make_block(block.values.astype(object), block.items, + block.ref_items) + else: + newb = block + new_blocks.append(newb) + + # use any ref_items + return _consolidate(new_blocks, newb.ref_items) + +def _get_all_block_kinds(blockmaps): + kinds = set() + for mapping in blockmaps: + kinds |= set(mapping) + return kinds + +#---------------------------------------------------------------------- +# Concatenate DataFrame objects + +def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, + keys=None, levels=None, names=None, verify_integrity=False): + """ + Concatenate pandas objects along a particular axis with optional set logic + along the other axes. Can also add a layer of hierarchical indexing on the + concatenation axis, which may be useful if the labels are the same (or + overlapping) on the passed axis number + + Parameters + ---------- + objs : list or dict of Series, DataFrame, or Panel objects + If a dict is passed, the sorted keys will be used as the `keys` + argument, unless it is passed, in which case the values will be + selected (see below). Any None objects will be dropped silently unless + they are all None in which case an Exception will be raised + axis : {0, 1, ...}, default 0 + The axis to concatenate along + join : {'inner', 'outer'}, default 'outer' + How to handle indexes on other axis(es) + join_axes : list of Index objects + Specific indexes to use for the other n - 1 axes instead of performing + inner/outer set logic + verify_integrity : boolean, default False + Check whether the new concatenated axis contains duplicates. 
This can + be very expensive relative to the actual data concatenation + keys : sequence, default None + If multiple levels passed, should contain tuples. Construct + hierarchical index using the passed keys as the outermost level + levels : list of sequences, default None + Specific levels (unique values) to use for constructing a + MultiIndex. Otherwise they will be inferred from the keys + names : list, default None + Names for the levels in the resulting hierarchical index + ignore_index : boolean, default False + If True, do not use the index values on the concatenation axis. The + resulting axis will be labeled 0, ..., n - 1. This is useful if you are + concatenating objects where the concatenation axis does not have + meaningful indexing information. + + Notes + ----- + The keys, levels, and names arguments are all optional + + Returns + ------- + concatenated : type of objects + """ + op = _Concatenator(objs, axis=axis, join_axes=join_axes, + ignore_index=ignore_index, join=join, + keys=keys, levels=levels, names=names, + verify_integrity=verify_integrity) + return op.get_result() + + +class _Concatenator(object): + """ + Orchestrates a concatenation operation for BlockManagers, with little hacks + to support sparse data structures, etc. + """ + + def __init__(self, objs, axis=0, join='outer', join_axes=None, + keys=None, levels=None, names=None, + ignore_index=False, verify_integrity=False): + if join == 'outer': + self.intersect = False + elif join == 'inner': + self.intersect = True + else: # pragma: no cover + raise ValueError('Only can inner (intersect) or outer (union) join ' + 'the other axis') + + if isinstance(objs, dict): + if keys is None: + keys = sorted(objs) + objs = [objs[k] for k in keys] + + # filter Nones + objs = [obj for obj in objs if obj is not None] + + if len(objs) == 0: + raise Exception('All objects passed were None') + + # consolidate data + for obj in objs: + if isinstance(obj, NDFrame): + obj.consolidate(inplace=True) + self.objs = objs + + sample = objs[0] + + # Need to flip BlockManager axis in the DataFrame special case + if isinstance(sample, DataFrame): + axis = 1 if axis == 0 else 0 + + self._is_series = isinstance(sample, Series) + assert(0 <= axis <= sample.ndim) + + # note: this is the BlockManager axis (since DataFrame is transposed) + self.axis = axis + + self.join_axes = join_axes + + self.keys = keys + self.names = names + self.levels = levels + + self.ignore_index = ignore_index + self.verify_integrity = verify_integrity + + self.new_axes = self._get_new_axes() + + def get_result(self): + if self._is_series and self.axis == 0: + new_data = np.concatenate([x.values for x in self.objs]) + name = com._consensus_name_attr(self.objs) + return Series(new_data, index=self.new_axes[0], name=name) + elif self._is_series: + data = dict(zip(self.new_axes[1], self.objs)) + return DataFrame(data, index=self.new_axes[0], + columns=self.new_axes[1]) + else: + new_data = self._get_concatenated_data() + return self.objs[0]._from_axes(new_data, self.new_axes) + + def _get_fresh_axis(self): + return Index(np.arange(len(self._get_concat_axis()))) + + def _get_concatenated_data(self): + try: + # need to conform to same other (joined) axes for block join + reindexed_data = self._get_reindexed_data() + + blockmaps = [] + for data in reindexed_data: + data = data.consolidate() + type_map = dict((type(blk), blk) for blk in data.blocks) + blockmaps.append(type_map) + kinds = _get_all_block_kinds(blockmaps) + + new_blocks = [] + for kind in kinds: + klass_blocks = 
[mapping.get(kind) for mapping in blockmaps] + stacked_block = self._concat_blocks(klass_blocks) + new_blocks.append(stacked_block) + + if self.axis == 0 and self.ignore_index: + self.new_axes[0] = self._get_fresh_axis() + + for blk in new_blocks: + blk.ref_items = self.new_axes[0] + + new_data = BlockManager(new_blocks, self.new_axes) + except Exception: # EAFP + # should not be possible to fail here for the expected reason with + # axis = 0 + if self.axis == 0: # pragma: no cover + raise + + new_data = {} + for item in self.new_axes[0]: + new_data[item] = self._concat_single_item(item) + + return new_data + + def _get_reindexed_data(self): + # HACK: ugh + + reindexed_data = [] + if isinstance(self.objs[0], SparseDataFrame): + pass + else: + axes_to_reindex = list(enumerate(self.new_axes)) + axes_to_reindex.pop(self.axis) + + for obj in self.objs: + data = obj._data + for i, ax in axes_to_reindex: + data = data.reindex_axis(ax, axis=i, copy=False) + reindexed_data.append(data) + + return reindexed_data + + def _concat_blocks(self, blocks): + values_list = [b.values for b in blocks if b is not None] + if isinstance(blocks[0], DatetimeBlock): + # hack around NumPy 1.6 bug + concat_values = np.concatenate([x.view(np.int64) + for x in values_list], + axis=self.axis) + concat_values = concat_values.view(np.dtype('M8[ns]')) + else: + concat_values = np.concatenate(values_list, axis=self.axis) + + if self.axis > 0: + # Not safe to remove this check, need to profile + if not _all_indexes_same([b.items for b in blocks]): + raise Exception('dtypes are not consistent throughout ' + 'DataFrames') + return make_block(concat_values, blocks[0].items, self.new_axes[0]) + else: + offsets = np.r_[0, np.cumsum([len(x._data.axes[0]) for + x in self.objs])] + indexer = np.concatenate([offsets[i] + b.ref_locs + for i, b in enumerate(blocks) + if b is not None]) + if self.ignore_index: + concat_items = indexer + else: + concat_items = self.new_axes[0].take(indexer) + + if self.ignore_index: + ref_items = self._get_fresh_axis() + return make_block(concat_values, concat_items, ref_items) + + return make_block(concat_values, concat_items, self.new_axes[0]) + + def _concat_single_item(self, item): + all_values = [] + dtypes = set() + for obj in self.objs: + try: + values = obj._data.get(item) + dtypes.add(values.dtype) + all_values.append(values) + except KeyError: + all_values.append(None) + + # this stinks + have_object = False + for dtype in dtypes: + if issubclass(dtype.type, (np.object_, np.bool_)): + have_object = True + if have_object: + empty_dtype = np.object_ + else: + empty_dtype = np.float64 + + to_concat = [] + for obj, item_values in zip(self.objs, all_values): + if item_values is None: + shape = obj._data.shape[1:] + missing_arr = np.empty(shape, dtype=empty_dtype) + missing_arr.fill(np.nan) + to_concat.append(missing_arr) + else: + to_concat.append(item_values) + + # this method only gets called with axis >= 1 + assert(self.axis >= 1) + return np.concatenate(to_concat, axis=self.axis - 1) + + def _get_result_dim(self): + if self._is_series and self.axis == 1: + return 2 + else: + return self.objs[0].ndim + + def _get_new_axes(self): + ndim = self._get_result_dim() + new_axes = [None] * ndim + + if self.join_axes is None: + for i in range(ndim): + if i == self.axis: + continue + new_axes[i] = self._get_comb_axis(i) + else: + assert(len(self.join_axes) == ndim - 1) + + # ufff... 
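+ # join_axes supplies every axis except the concatenation axis, so map the
+ # passed indexes onto the remaining axis positions in order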
+ indices = range(ndim) + indices.remove(self.axis) + + for i, ax in zip(indices, self.join_axes): + new_axes[i] = ax + + if self.ignore_index: + concat_axis = None + else: + concat_axis = self._get_concat_axis() + + new_axes[self.axis] = concat_axis + + return new_axes + + def _get_comb_axis(self, i): + if self._is_series: + all_indexes = [x.index for x in self.objs] + else: + all_indexes = [x._data.axes[i] for x in self.objs] + + return _get_combined_index(all_indexes, intersect=self.intersect) + + def _get_concat_axis(self): + if self._is_series: + if self.axis == 0: + indexes = [x.index for x in self.objs] + elif self.keys is None: + return Index(np.arange(len(self.objs))) + else: + return _ensure_index(self.keys) + else: + indexes = [x._data.axes[self.axis] for x in self.objs] + + if self.keys is None: + concat_axis = _concat_indexes(indexes) + else: + concat_axis = _make_concat_multiindex(indexes, self.keys, + self.levels, self.names) + + self._maybe_check_integrity(concat_axis) + + return concat_axis + + def _maybe_check_integrity(self, concat_index): + if self.verify_integrity: + if not concat_index.is_unique: + overlap = concat_index.get_duplicates() + raise Exception('Indexes have overlapping values: %s' + % str(overlap)) + + +def _concat_indexes(indexes): + return indexes[0].append(indexes[1:]) + +def _make_concat_multiindex(indexes, keys, levels=None, names=None): + if ((levels is None and isinstance(keys[0], tuple)) or + (levels is not None and len(levels) > 1)): + zipped = zip(*keys) + if names is None: + names = [None] * len(zipped) + + if levels is None: + levels = [Factor.from_array(zp).levels for zp in zipped] + else: + levels = [_ensure_index(x) for x in levels] + else: + zipped = [keys] + if names is None: + names = [None] + + if levels is None: + levels = [_ensure_index(keys)] + else: + levels = [_ensure_index(x) for x in levels] + + if not _all_indexes_same(indexes): + label_list = [] + + # things are potentially different sizes, so compute the exact labels + # for each level and pass those to MultiIndex.from_arrays + + for hlevel, level in zip(zipped, levels): + to_concat = [] + for key, index in zip(hlevel, indexes): + try: + i = level.get_loc(key) + except KeyError: + raise ValueError('Key %s not in level %s' % (str(key), str(level))) + + to_concat.append(np.repeat(i, len(index))) + label_list.append(np.concatenate(to_concat)) + + concat_index = _concat_indexes(indexes) + + # these go at the end + if isinstance(concat_index, MultiIndex): + levels.extend(concat_index.levels) + label_list.extend(concat_index.labels) + else: + factor = Factor.from_array(concat_index) + levels.append(factor.levels) + label_list.append(factor.labels) + + if len(names) == len(levels): + names = list(names) + else: + # also copies + names = names + _get_consensus_names(indexes) + + return MultiIndex(levels=levels, labels=label_list, names=names) + + new_index = indexes[0] + n = len(new_index) + kpieces = len(indexes) + + # also copies + new_names = list(names) + new_levels = list(levels) + + # construct labels + new_labels = [] + + # do something a bit more speedy + + for hlevel, level in zip(zipped, levels): + hlevel = _ensure_index(hlevel) + mapped = level.get_indexer(hlevel) + + mask = mapped == -1 + if mask.any(): + raise ValueError('Values not found in passed level: %s' + % str(hlevel[mask])) + + new_labels.append(np.repeat(mapped, n)) + + if isinstance(new_index, MultiIndex): + new_levels.extend(new_index.levels) + new_labels.extend([np.tile(lab, kpieces) for lab in 
new_index.labels]) + else: + new_levels.append(new_index) + new_labels.append(np.tile(np.arange(n), kpieces)) + + if len(new_names) < len(new_levels): + new_names.extend(new_index.names) + + return MultiIndex(levels=new_levels, labels=new_labels, names=new_names) + + +def _should_fill(lname, rname): + if not isinstance(lname, basestring) or not isinstance(rname, basestring): + return True + return lname == rname + + + +def _any(x): + return x is not None and len(x) > 0 and any([y is not None for y in x]) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py new file mode 100644 index 00000000..146cba82 --- /dev/null +++ b/pandas/tools/pivot.py @@ -0,0 +1,300 @@ +# pylint: disable=E1103 + +from pandas import Series, DataFrame +from pandas.core.index import MultiIndex +from pandas.core.reshape import _unstack_multiple +from pandas.tools.merge import concat +import pandas.core.common as com +import numpy as np + + +def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean', + fill_value=None, margins=False): + """ + Create a spreadsheet-style pivot table as a DataFrame. The levels in the + pivot table will be stored in MultiIndex objects (hierarchical indexes) on + the index and columns of the result DataFrame + + Parameters + ---------- + data : DataFrame + values : column to aggregate, optional + rows : list of column names or arrays to group on + Keys to group on the x-axis of the pivot table + cols : list of column names or arrays to group on + Keys to group on the y-axis of the pivot table + aggfunc : function, default numpy.mean, or list of functions + If list of functions passed, the resulting pivot table will have + hierarchical columns whose top level are the function names (inferred + from the function objects themselves) + fill_value : scalar, default None + Value to replace missing values with + margins : boolean, default False + Add all row / columns (e.g. for subtotal / grand totals) + + Examples + -------- + >>> df + A B C D + 0 foo one small 1 + 1 foo one large 2 + 2 foo one large 2 + 3 foo two small 3 + 4 foo two small 3 + 5 bar one large 4 + 6 bar one small 5 + 7 bar two small 6 + 8 bar two large 7 + + >>> table = pivot_table(df, values='D', rows=['A', 'B'], + ... 
cols=['C'], aggfunc=np.sum) + >>> table + small large + foo one 1 4 + two 6 NaN + bar one 5 4 + two 6 7 + + Returns + ------- + table : DataFrame + """ + rows = _convert_by(rows) + cols = _convert_by(cols) + + if isinstance(aggfunc, list): + pieces = [] + keys = [] + for func in aggfunc: + table = pivot_table(data, values=values, rows=rows, cols=cols, + fill_value=fill_value, aggfunc=func, + margins=margins) + pieces.append(table) + keys.append(func.__name__) + return concat(pieces, keys=keys, axis=1) + + keys = rows + cols + + values_passed = values is not None + if values_passed: + if isinstance(values, (list, tuple)): + values_multi = True + else: + values_multi = False + values = [values] + else: + values = list(data.columns.drop(keys)) + + if values_passed: + to_filter = [] + for x in keys + values: + try: + if x in data: + to_filter.append(x) + except TypeError: + pass + if len(to_filter) < len(data.columns): + data = data[to_filter] + + grouped = data.groupby(keys) + agged = grouped.agg(aggfunc) + + to_unstack = [agged.index.names[i] + for i in range(len(rows), len(keys))] + + table = agged.unstack(to_unstack) + + if isinstance(table, DataFrame): + if isinstance(table.columns, MultiIndex): + table = table.sortlevel(axis=1) + else: + table = table.sort_index(axis=1) + + if fill_value is not None: + table = table.fillna(value=fill_value) + + if margins: + table = _add_margins(table, data, values, rows=rows, + cols=cols, aggfunc=aggfunc) + + # discard the top level + if values_passed and not values_multi: + table = table[values[0]] + + return table + + +DataFrame.pivot_table = pivot_table + +def _add_margins(table, data, values, rows=None, cols=None, aggfunc=np.mean): + grand_margin = {} + for k, v in data[values].iteritems(): + try: + if isinstance(aggfunc, basestring): + grand_margin[k] = getattr(v, aggfunc)() + else: + grand_margin[k] = aggfunc(v) + except TypeError: + pass + + if len(cols) > 0: + # need to "interleave" the margins + table_pieces = [] + margin_keys = [] + + + def _all_key(key): + return (key, 'All') + ('',) * (len(cols) - 1) + + if len(rows) > 0: + margin = data[rows + values].groupby(rows).agg(aggfunc) + cat_axis = 1 + for key, piece in table.groupby(level=0, axis=cat_axis): + all_key = _all_key(key) + piece[all_key] = margin[key] + table_pieces.append(piece) + margin_keys.append(all_key) + else: + margin = grand_margin + cat_axis = 0 + for key, piece in table.groupby(level=0, axis=cat_axis): + all_key = _all_key(key) + table_pieces.append(piece) + table_pieces.append(Series(margin[key], index=[all_key])) + margin_keys.append(all_key) + + result = concat(table_pieces, axis=cat_axis) + + if len(rows) == 0: + return result + else: + result = table + margin_keys = table.columns + + if len(cols) > 0: + row_margin = data[cols + values].groupby(cols).agg(aggfunc) + row_margin = row_margin.stack() + + # slight hack + new_order = [len(cols)] + range(len(cols)) + row_margin.index = row_margin.index.reorder_levels(new_order) + else: + row_margin = Series(np.nan, index=result.columns) + + key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' + + row_margin = row_margin.reindex(result.columns) + # populate grand margin + for k in margin_keys: + if len(cols) > 0: + row_margin[k] = grand_margin[k[0]] + else: + row_margin[k] = grand_margin[k] + + margin_dummy = DataFrame(row_margin, columns=[key]).T + + row_names = result.index.names + result = result.append(margin_dummy) + result.index.names = row_names + + return result + +def _convert_by(by): + if by is None: + 
by = [] + elif (np.isscalar(by) or isinstance(by, np.ndarray) + or hasattr(by, '__call__')): + by = [by] + else: + by = list(by) + return by + +def crosstab(rows, cols, values=None, rownames=None, colnames=None, + aggfunc=None, margins=False): + """ + Compute a simple cross-tabulation of two (or more) factors. By default + computes a frequency table of the factors unless an array of values and an + aggregation function are passed + + Parameters + ---------- + rows : array-like, Series, or list of arrays/Series + Values to group by in the rows + cols : array-like, Series, or list of arrays/Series + Values to group by in the columns + values : array-like, optional + Array of values to aggregate according to the factors + aggfunc : function, optional + If no values array is passed, computes a frequency table + rownames : sequence, default None + If passed, must match number of row arrays passed + colnames : sequence, default None + If passed, must match number of column arrays passed + margins : boolean, default False + Add row/column margins (subtotals) + + Notes + ----- + Any Series passed will have their name attributes used unless row or column + names for the cross-tabulation are specified + + Examples + -------- + >>> a + array([foo, foo, foo, foo, bar, bar, + bar, bar, foo, foo, foo], dtype=object) + >>> b + array([one, one, one, two, one, one, + one, two, two, two, one], dtype=object) + >>> c + array([dull, dull, shiny, dull, dull, shiny, + shiny, dull, shiny, shiny, shiny], dtype=object) + + >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + b one two + c dull shiny dull shiny + a + bar 1 2 1 0 + foo 2 2 1 2 + + Returns + ------- + crosstab : DataFrame + """ + rows = com._maybe_make_list(rows) + cols = com._maybe_make_list(cols) + + rownames = _get_names(rows, rownames, prefix='row') + colnames = _get_names(cols, colnames, prefix='col') + + data = {} + data.update(zip(rownames, rows)) + data.update(zip(colnames, cols)) + + if values is None: + df = DataFrame(data) + df['__dummy__'] = 0 + table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, + aggfunc=len, margins=margins) + return table.fillna(0).astype(np.int64) + else: + data['__dummy__'] = values + df = DataFrame(data) + table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, + aggfunc=aggfunc, margins=margins) + return table + +def _get_names(arrs, names, prefix='row'): + if names is None: + names = [] + for i, arr in enumerate(arrs): + if isinstance(arr, Series) and arr.name is not None: + names.append(arr.name) + else: + names.append('%s_%d' % (prefix, i)) + else: + assert(len(names) == len(arrs)) + if not isinstance(names, list): + names = list(names) + + return names diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py new file mode 100644 index 00000000..9ab4dccb --- /dev/null +++ b/pandas/tools/plotting.py @@ -0,0 +1,1555 @@ +# being a bit too dynamic +# pylint: disable=E1101 +from itertools import izip +import datetime + +import numpy as np + +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +from pandas.core.index import MultiIndex +from pandas.core.series import Series, remove_na +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex +from pandas.tseries.frequencies import get_period_alias, get_base_alias +from pandas.tseries.offsets import DateOffset + +try: # mpl optional + import pandas.tseries.converter as conv + conv.register() +except ImportError: + pass + +def _get_standard_kind(kind): + return 
{'density' : 'kde'}.get(kind, kind) + + +def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False, + diagonal='hist', marker='.', **kwds): + """ + Draw a matrix of scatter plots. + + Parameters + ---------- + alpha : amount of transparency applied + figsize : a tuple (width, height) in inches + ax : Matplotlib axis object + grid : setting this to True will show the grid + diagonal : pick between 'kde' and 'hist' for + either Kernel Density Estimation or Histogram + plon in the diagonal + kwds : other plotting keyword arguments + To be passed to scatter function + + Examples + -------- + >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D']) + >>> scatter_matrix(df, alpha=0.2) + """ + df = frame._get_numeric_data() + n = df.columns.size + fig, axes = _subplots(nrows=n, ncols=n, figsize=figsize, ax=ax, + squeeze=False) + + # no gaps between subplots + fig.subplots_adjust(wspace=0, hspace=0) + + mask = com.notnull(df) + + marker = _get_marker_compat(marker) + + for i, a in zip(range(n), df.columns): + for j, b in zip(range(n), df.columns): + if i == j: + values = df[a].values[mask[a].values] + + # Deal with the diagonal by drawing a histogram there. + if diagonal == 'hist': + axes[i, j].hist(values) + elif diagonal in ('kde', 'density'): + from scipy.stats import gaussian_kde + y = values + gkde = gaussian_kde(y) + ind = np.linspace(y.min(), y.max(), 1000) + axes[i, j].plot(ind, gkde.evaluate(ind), **kwds) + else: + common = (mask[a] & mask[b]).values + + axes[i, j].scatter(df[b][common], df[a][common], + marker=marker, alpha=alpha, **kwds) + + axes[i, j].set_xlabel('') + axes[i, j].set_ylabel('') + axes[i, j].set_xticklabels([]) + axes[i, j].set_yticklabels([]) + ticks = df.index + + is_datetype = ticks.inferred_type in ('datetime', 'date', + 'datetime64') + + if ticks.is_numeric() or is_datetype: + """ + Matplotlib supports numeric values or datetime objects as + xaxis values. Taking LBYL approach here, by the time + matplotlib raises exception when using non numeric/datetime + values for xaxis, several actions are already taken by plt. 
+ """ + ticks = ticks._mpl_repr() + + # setup labels + if i == 0 and j % 2 == 1: + axes[i, j].set_xlabel(b, visible=True) + #axes[i, j].xaxis.set_visible(True) + axes[i, j].set_xlabel(b) + axes[i, j].set_xticklabels(ticks) + axes[i, j].xaxis.set_ticks_position('top') + axes[i, j].xaxis.set_label_position('top') + if i == n - 1 and j % 2 == 0: + axes[i, j].set_xlabel(b, visible=True) + #axes[i, j].xaxis.set_visible(True) + axes[i, j].set_xlabel(b) + axes[i, j].set_xticklabels(ticks) + axes[i, j].xaxis.set_ticks_position('bottom') + axes[i, j].xaxis.set_label_position('bottom') + if j == 0 and i % 2 == 0: + axes[i, j].set_ylabel(a, visible=True) + #axes[i, j].yaxis.set_visible(True) + axes[i, j].set_ylabel(a) + axes[i, j].set_yticklabels(ticks) + axes[i, j].yaxis.set_ticks_position('left') + axes[i, j].yaxis.set_label_position('left') + if j == n - 1 and i % 2 == 1: + axes[i, j].set_ylabel(a, visible=True) + #axes[i, j].yaxis.set_visible(True) + axes[i, j].set_ylabel(a) + axes[i, j].set_yticklabels(ticks) + axes[i, j].yaxis.set_ticks_position('right') + axes[i, j].yaxis.set_label_position('right') + + axes[i, j].grid(b=grid) + + return axes + +def _gca(): + import matplotlib.pyplot as plt + return plt.gca() + +def _gcf(): + import matplotlib.pyplot as plt + return plt.gcf() + +def _get_marker_compat(marker): + import matplotlib.lines as mlines + import matplotlib as mpl + if mpl.__version__ < '1.1.0' and marker == '.': + return 'o' + if marker not in mlines.lineMarkers: + return 'o' + return marker + +def andrews_curves(data, class_column, ax=None, samples=200): + """ + Parameters: + data: A DataFrame containing data to be plotted, preferably + normalized to (0.0, 1.0). + class_column: Name of the column containing class names. + samples: Number of points to plot in each curve. + """ + from math import sqrt, pi, sin, cos + import matplotlib.pyplot as plt + import random + def function(amplitudes): + def f(x): + x1 = amplitudes[0] + result = x1 / sqrt(2.0) + harmonic = 1.0 + for x_even, x_odd in zip(amplitudes[1::2], amplitudes[2::2]): + result += (x_even * sin(harmonic * x) + + x_odd * cos(harmonic * x)) + harmonic += 1.0 + if len(amplitudes) % 2 != 0: + result += amplitudes[-1] * sin(harmonic * x) + return result + return f + def random_color(column): + random.seed(column) + return [random.random() for _ in range(3)] + n = len(data) + classes = set(data[class_column]) + class_col = data[class_column] + columns = [data[col] for col in data.columns if (col != class_column)] + x = [-pi + 2.0 * pi * (t / float(samples)) for t in range(samples)] + used_legends = set([]) + if ax == None: + ax = plt.gca(xlim=(-pi, pi)) + for i in range(n): + row = [columns[c][i] for c in range(len(columns))] + f = function(row) + y = [f(t) for t in x] + label = None + if str(class_col[i]) not in used_legends: + label = str(class_col[i]) + used_legends.add(label) + ax.plot(x, y, color=random_color(class_col[i]), label=label) + ax.legend(loc='upper right') + ax.grid() + return ax + +def lag_plot(series, ax=None, **kwds): + """Lag plot for time series. 
+ + Parameters: + ----------- + series: Time series + ax: Matplotlib axis object, optional + kwds: Matplotlib scatter method keyword arguments, optional + + Returns: + -------- + ax: Matplotlib axis object + """ + import matplotlib.pyplot as plt + data = series.values + y1 = data[:-1] + y2 = data[1:] + if ax == None: + ax = plt.gca() + ax.set_xlabel("y(t)") + ax.set_ylabel("y(t + 1)") + ax.scatter(y1, y2, **kwds) + return ax + +def autocorrelation_plot(series, ax=None): + """Autocorrelation plot for time series. + + Parameters: + ----------- + series: Time series + ax: Matplotlib axis object, optional + + Returns: + ----------- + ax: Matplotlib axis object + """ + import matplotlib.pyplot as plt + n = len(series) + data = np.asarray(series) + if ax == None: + ax = plt.gca(xlim=(1, n), ylim=(-1.0, 1.0)) + mean = np.mean(data) + c0 = np.sum((data - mean) ** 2) / float(n) + def r(h): + return ((data[:n - h] - mean) * (data[h:] - mean)).sum() / float(n) / c0 + x = np.arange(n) + 1 + y = map(r, x) + z95 = 1.959963984540054 + z99 = 2.5758293035489004 + ax.axhline(y=z99/np.sqrt(n), linestyle='--', color='grey') + ax.axhline(y=z95/np.sqrt(n), color='grey') + ax.axhline(y=0.0, color='black') + ax.axhline(y=-z95/np.sqrt(n), color='grey') + ax.axhline(y=-z99/np.sqrt(n), linestyle='--', color='grey') + ax.set_xlabel("Lag") + ax.set_ylabel("Autocorrelation") + ax.plot(x, y) + ax.grid() + return ax + +def grouped_hist(data, column=None, by=None, ax=None, bins=50, log=False, + figsize=None, layout=None, sharex=False, sharey=False, + rot=90): + """ + + Returns + ------- + fig : matplotlib.Figure + """ + # if isinstance(data, DataFrame): + # data = data[column] + + def plot_group(group, ax): + ax.hist(group.dropna(), bins=bins) + + fig, axes = _grouped_plot(plot_group, data, column=column, + by=by, sharex=sharex, sharey=sharey, + figsize=figsize, layout=layout, rot=rot) + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, + hspace=0.3, wspace=0.2) + return fig + +class MPLPlot(object): + """ + Base class for assembling a pandas plot using matplotlib + + Parameters + ---------- + data : + + """ + _default_rot = 0 + + _pop_attributes = ['label', 'style', 'logy', 'logx', 'loglog'] + _attr_defaults = {'logy': False, 'logx': False, 'loglog': False} + + def __init__(self, data, kind=None, by=None, subplots=False, sharex=True, + sharey=False, use_index=True, + figsize=None, grid=None, legend=True, rot=None, + ax=None, fig=None, title=None, xlim=None, ylim=None, + xticks=None, yticks=None, + sort_columns=False, fontsize=None, + secondary_y=False, **kwds): + + self.data = data + self.by = by + + self.kind = kind + + self.sort_columns = sort_columns + + self.subplots = subplots + self.sharex = sharex + self.sharey = sharey + self.figsize = figsize + + self.xticks = xticks + self.yticks = yticks + self.xlim = xlim + self.ylim = ylim + self.title = title + self.use_index = use_index + + self.fontsize = fontsize + self.rot = rot + + if grid is None: + grid = False if secondary_y else True + + self.grid = grid + self.legend = legend + + for attr in self._pop_attributes: + value = kwds.pop(attr, self._attr_defaults.get(attr, None)) + setattr(self, attr, value) + + self.ax = ax + self.fig = fig + self.axes = None + + self.secondary_y = secondary_y + + self.kwds = kwds + + def _iter_data(self): + from pandas.core.frame import DataFrame + if isinstance(self.data, (Series, np.ndarray)): + yield com._stringify(self.label), np.asarray(self.data) + elif isinstance(self.data, DataFrame): + df = self.data + + if 
self.sort_columns: + columns = com._try_sort(df.columns) + else: + columns = df.columns + + for col in columns: + empty = df[col].count() == 0 + # is this right? + values = df[col].values if not empty else np.zeros(len(df)) + + col = com._stringify(col) + yield col, values + + @property + def nseries(self): + if self.data.ndim == 1: + return 1 + else: + return self.data.shape[1] + + def draw(self): + self.plt.draw_if_interactive() + + def generate(self): + self._args_adjust() + self._compute_plot_data() + self._setup_subplots() + self._make_plot() + self._post_plot_logic() + self._adorn_subplots() + + def _args_adjust(self): + pass + + def _maybe_right_yaxis(self, ax): + ypos = ax.get_yaxis().get_ticks_position().strip().lower() + + if self.secondary_y and ypos != 'right': + orig_ax = ax + ax = ax.twinx() + if len(orig_ax.get_lines()) == 0: # no data on left y + orig_ax.get_yaxis().set_visible(False) + else: + ax.get_yaxis().set_visible(True) + + return ax + + def _setup_subplots(self): + if self.subplots: + nrows, ncols = self._get_layout() + if self.ax is None: + fig, axes = _subplots(nrows=nrows, ncols=ncols, + sharex=self.sharex, sharey=self.sharey, + figsize=self.figsize, + secondary_y=self.secondary_y, + data=self.data) + else: + fig, axes = _subplots(nrows=nrows, ncols=ncols, + sharex=self.sharex, sharey=self.sharey, + figsize=self.figsize, ax=self.ax, + secondary_y=self.secondary_y, + data=self.data) + else: + if self.ax is None: + fig = self.plt.figure(figsize=self.figsize) + ax = fig.add_subplot(111) + ax = self._maybe_right_yaxis(ax) + self.ax = ax + else: + fig = self.ax.get_figure() + self.ax = self._maybe_right_yaxis(self.ax) + + axes = [self.ax] + + self.fig = fig + self.axes = axes + + def _get_layout(self): + return (len(self.data.columns), 1) + + def _compute_plot_data(self): + pass + + def _make_plot(self): + raise NotImplementedError + + def _post_plot_logic(self): + pass + + def _adorn_subplots(self): + if self.subplots: + to_adorn = self.axes + else: + to_adorn = [self.ax] + + # todo: sharex, sharey handling? 
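+ # Apply the user-supplied ticks, axis limits and grid to every subplot;
+ # the legend is only drawn on the main axes when subplots=False, and the
+ # title goes on the figure (suptitle) in the subplots case. If a
+ # non-numeric index forced positional x values, the stringified index
+ # labels are substituted for the default tick labels at the end.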
+ + for ax in to_adorn: + if self.yticks is not None: + ax.set_yticks(self.yticks) + + if self.xticks is not None: + ax.set_xticks(self.xticks) + + if self.ylim is not None: + ax.set_ylim(self.ylim) + + if self.xlim is not None: + ax.set_xlim(self.xlim) + + ax.grid(self.grid) + + if self.legend and not self.subplots: + self.ax.legend(loc='best', title=self.legend_title) + + if self.title: + if self.subplots: + self.fig.suptitle(self.title) + else: + self.ax.set_title(self.title) + + if self._need_to_set_index: + labels = [_stringify(key) for key in self.data.index] + labels = dict(zip(range(len(self.data.index)), labels)) + + for ax_ in self.axes: + # ax_.set_xticks(self.xticks) + xticklabels = [labels.get(x, '') for x in ax_.get_xticks()] + ax_.set_xticklabels(xticklabels, rotation=self.rot) + + @property + def legend_title(self): + if hasattr(self.data, 'columns'): + if not isinstance(self.data.columns, MultiIndex): + name = self.data.columns.name + if name is not None: + name = str(name) + return name + else: + stringified = map(str, self.data.columns.names) + return ','.join(stringified) + else: + return None + + @cache_readonly + def plt(self): + import matplotlib.pyplot as plt + return plt + + _need_to_set_index = False + + def _get_xticks(self, convert_period=False): + index = self.data.index + is_datetype = index.inferred_type in ('datetime', 'date', + 'datetime64', 'time') + + if self.use_index: + if convert_period and isinstance(index, PeriodIndex): + index = index.to_timestamp() + x = index._mpl_repr() + elif index.is_numeric() or is_datetype: + """ + Matplotlib supports numeric values or datetime objects as + xaxis values. Taking LBYL approach here, by the time + matplotlib raises exception when using non numeric/datetime + values for xaxis, several actions are already taken by plt. 
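+ Other index types fall back to positional integers here and set
+ _need_to_set_index so the real index labels can be swapped in later
+ by _adorn_subplots.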
+ """ + x = index._mpl_repr() + else: + self._need_to_set_index = True + x = range(len(index)) + else: + x = range(len(index)) + + return x + + def _get_plot_function(self): + if self.logy: + plotf = self.plt.Axes.semilogy + elif self.logx: + plotf = self.plt.Axes.semilogx + elif self.loglog: + plotf = self.plt.Axes.loglog + else: + plotf = self.plt.Axes.plot + + return plotf + + def _get_index_name(self): + if isinstance(self.data.index, MultiIndex): + name = self.data.index.names + if any(x is not None for x in name): + name = ','.join([str(x) for x in name]) + else: + name = None + else: + name = self.data.index.name + if name is not None: + name = str(name) + + return name + + def _get_ax_and_style(self, i): + if self.subplots: + ax = self.axes[i] + style = 'k' + else: + style = '' # empty string ignored + ax = self.ax + + return ax, style + +class KdePlot(MPLPlot): + def __init__(self, data, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + + def _make_plot(self): + from scipy.stats import gaussian_kde + plotf = self._get_plot_function() + for i, (label, y) in enumerate(self._iter_data()): + + ax, style = self._get_ax_and_style(i) + + if self.style: + style = self.style + gkde = gaussian_kde(y) + sample_range = max(y) - min(y) + ind = np.linspace(min(y) - 0.5 * sample_range, + max(y) + 0.5 * sample_range, 1000) + ax.set_ylabel("Density") + plotf(ax, ind, gkde.evaluate(ind), style, label=label, **self.kwds) + ax.grid(self.grid) + + def _post_plot_logic(self): + df = self.data + + if self.subplots and self.legend: + self.axes[0].legend(loc='best') + +class LinePlot(MPLPlot): + + def __init__(self, data, **kwargs): + MPLPlot.__init__(self, data, **kwargs) + + def _index_freq(self): + from pandas.core.frame import DataFrame + if isinstance(self.data, (Series, DataFrame)): + freq = (getattr(self.data.index, 'freq', None) + or getattr(self.data.index, 'inferred_freq', None)) + return freq + + def _is_dynamic_freq(self, freq): + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = get_base_alias(freq) + freq = get_period_alias(freq) + return freq is not None + + def _use_dynamic_x(self): + freq = self._index_freq() + + ax, _ = self._get_ax_and_style(0) + ax_freq = getattr(ax, 'freq', None) + if freq is None: # convert irregular if axes has freq info + freq = ax_freq + else: # do not use tsplot if irregular was plotted first + if (ax_freq is None) and (len(ax.get_lines()) > 0): + return False + + return (freq is not None) and self._is_dynamic_freq(freq) + + def _make_plot(self): + # this is slightly deceptive + if self.use_index and self._use_dynamic_x(): + data = self._maybe_convert_index(self.data) + self._make_ts_plot(data) + else: + x = self._get_xticks(convert_period=True) + + plotf = self._get_plot_function() + + for i, (label, y) in enumerate(self._iter_data()): + + ax, style = self._get_ax_and_style(i) + + if self.style: + style = self.style + + mask = com.isnull(y) + if mask.any(): + y = np.ma.array(y) + y = np.ma.masked_where(mask, y) + + plotf(ax, x, y, style, label=label, **self.kwds) + ax.grid(self.grid) + + def _maybe_convert_index(self, data): + # tsplot converts automatically, but don't want to convert index + # over and over for DataFrames + from pandas.core.frame import DataFrame + if (isinstance(data.index, DatetimeIndex) and + isinstance(data, DataFrame)): + freq = getattr(data.index, 'freq', None) + + if freq is None: + freq = getattr(data.index, 'inferred_freq', None) + + if isinstance(freq, DateOffset): + freq = freq.rule_code + + freq = 
get_period_alias(freq) + + if freq is None: + ax, _ = self._get_ax_and_style(0) + freq = getattr(ax, 'freq', None) + + if freq is None: + raise ValueError('Could not get frequency alias for plotting') + + data = DataFrame(data.values, + index=data.index.to_period(freq=freq), + columns=data.columns) + return data + + def _make_ts_plot(self, data, **kwargs): + from pandas.tseries.plotting import tsplot + + plotf = self._get_plot_function() + + if isinstance(data, Series): + ax, _ = self._get_ax_and_style(0) #self.axes[0] + + label = com._stringify(self.label) + tsplot(data, plotf, ax=ax, label=label, style=self.style, + **kwargs) + ax.grid(self.grid) + else: + for i, col in enumerate(data.columns): + ax, _ = self._get_ax_and_style(i) + label = com._stringify(col) + tsplot(data[col], plotf, ax=ax, label=label, **kwargs) + ax.grid(self.grid) + + # self.fig.subplots_adjust(wspace=0, hspace=0) + + + def _post_plot_logic(self): + df = self.data + + if self.legend: + if self.subplots: + for ax in self.axes: + ax.legend(loc='best') + else: + self.axes[0].legend(loc='best') + + condition = (not self._use_dynamic_x + and df.index.is_all_dates + and not self.subplots + or (self.subplots and self.sharex)) + + index_name = self._get_index_name() + + for ax in self.axes: + if condition: + format_date_labels(ax) + + if index_name is not None: + ax.set_xlabel(index_name) + +class BarPlot(MPLPlot): + _default_rot = {'bar' : 90, 'barh' : 0} + + def __init__(self, data, **kwargs): + self.stacked = kwargs.pop('stacked', False) + self.ax_pos = np.arange(len(data)) + 0.25 + MPLPlot.__init__(self, data, **kwargs) + + def _args_adjust(self): + if self.rot is None: + self.rot = self._default_rot[self.kind] + + if self.fontsize is None: + if len(self.data) < 10: + self.fontsize = 12 + else: + self.fontsize = 10 + + @property + def bar_f(self): + if self.kind == 'bar': + def f(ax, x, y, w, start=None, **kwds): + return ax.bar(x, y, w, bottom=start, **kwds) + elif self.kind == 'barh': + def f(ax, x, y, w, start=None, **kwds): + return ax.barh(x, y, w, left=start, **kwds) + else: + raise NotImplementedError + + return f + + def _make_plot(self): + colors = self.kwds.get('color', 'brgyk') + rects = [] + labels = [] + + ax, _ = self._get_ax_and_style(0) #self.axes[0] + + bar_f = self.bar_f + + pos_prior = neg_prior = np.zeros(len(self.data)) + + K = self.nseries + + for i, (label, y) in enumerate(self._iter_data()): + + kwds = self.kwds.copy() + kwds['color'] = colors[i % len(colors)] + + if self.subplots: + ax, _ = self._get_ax_and_style(i) #self.axes[i] + rect = bar_f(ax, self.ax_pos, y, 0.5, start=pos_prior, + linewidth=1, **kwds) + ax.set_title(label) + elif self.stacked: + mask = y > 0 + start = np.where(mask, pos_prior, neg_prior) + + rect = bar_f(ax, self.ax_pos, y, 0.5, start=start, + label=label, linewidth=1, **kwds) + pos_prior = pos_prior + np.where(mask, y, 0) + neg_prior = neg_prior + np.where(mask, 0, y) + else: + rect = bar_f(ax, self.ax_pos + i * 0.75 / K, y, 0.75 / K, + start=pos_prior, label=label, **kwds) + rects.append(rect) + labels.append(label) + + if self.legend and not self.subplots: + patches =[r[0] for r in rects] + + # Legend to the right of the plot + # ax.legend(patches, labels, bbox_to_anchor=(1.05, 1), + # loc=2, borderaxespad=0.) 
+ # self.fig.subplots_adjust(right=0.80) + + ax.legend(patches, labels, loc='best', + title=self.legend_title) + + # self.fig.subplots_adjust(top=0.8, wspace=0, hspace=0) + + def _post_plot_logic(self): + for ax in self.axes: + str_index = [_stringify(key) for key in self.data.index] + + name = self._get_index_name() + if self.kind == 'bar': + ax.set_xlim([self.ax_pos[0] - 0.25, self.ax_pos[-1] + 1]) + ax.set_xticks(self.ax_pos + 0.375) + ax.set_xticklabels(str_index, rotation=self.rot, + fontsize=self.fontsize) + ax.axhline(0, color='k', linestyle='--') + if name is not None: + ax.set_xlabel(name) + else: + # horizontal bars + ax.set_ylim([self.ax_pos[0] - 0.25, self.ax_pos[-1] + 1]) + ax.set_yticks(self.ax_pos + 0.375) + ax.set_yticklabels(str_index, rotation=self.rot, + fontsize=self.fontsize) + ax.axvline(0, color='k', linestyle='--') + if name is not None: + ax.set_ylabel(name) + +class BoxPlot(MPLPlot): + pass + + +class HistPlot(MPLPlot): + pass + + +def plot_frame(frame=None, subplots=False, sharex=True, sharey=False, + use_index=True, + figsize=None, grid=False, legend=True, rot=None, + ax=None, title=None, + xlim=None, ylim=None, logy=False, + xticks=None, yticks=None, + kind='line', + sort_columns=False, fontsize=None, secondary_y=False, **kwds): + """ + Make line or bar plot of DataFrame's series with the index on the x-axis + using matplotlib / pylab. + + Parameters + ---------- + subplots : boolean, default False + Make separate subplots for each time series + sharex : boolean, default True + In case subplots=True, share x axis + sharey : boolean, default False + In case subplots=True, share y axis + use_index : boolean, default True + Use index as ticks for x axis + stacked : boolean, default False + If True, create stacked bar plot. 
Only valid for DataFrame input + sort_columns: boolean, default False + Sort column names to determine plot ordering + title : string + Title to use for the plot + grid : boolean, default True + Axis grid lines + legend : boolean, default True + Place legend on axis subplots + + ax : matplotlib axis object, default None + kind : {'line', 'bar', 'barh'} + bar : vertical bar plot + barh : horizontal bar plot + logy : boolean, default False + For line plots, use log scaling on y axis + xticks : sequence + Values to use for the xticks + yticks : sequence + Values to use for the yticks + xlim : 2-tuple/list + ylim : 2-tuple/list + rot : int, default None + Rotation for ticks + secondary_y : boolean or sequence, default False + Whether to plot on the secondary y-axis + If dict then can select which columns to plot on secondary y-axis + kwds : keywords + Options to pass to matplotlib plotting method + + Returns + ------- + ax_or_axes : matplotlib.AxesSubplot or list of them + """ + kind = _get_standard_kind(kind.lower().strip()) + if kind == 'line': + klass = LinePlot + elif kind in ('bar', 'barh'): + klass = BarPlot + elif kind == 'kde': + klass = KdePlot + else: + raise ValueError('Invalid chart type given %s' % kind) + + plot_obj = klass(frame, kind=kind, subplots=subplots, rot=rot, + legend=legend, ax=ax, fontsize=fontsize, + use_index=use_index, sharex=sharex, sharey=sharey, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + title=title, grid=grid, figsize=figsize, logy=logy, + sort_columns=sort_columns, secondary_y=secondary_y, + **kwds) + plot_obj.generate() + plot_obj.draw() + if subplots: + return plot_obj.axes + else: + return plot_obj.axes[0] + + +def plot_series(series, label=None, kind='line', use_index=True, rot=None, + xticks=None, yticks=None, xlim=None, ylim=None, + ax=None, style=None, grid=None, logy=False, secondary_y=False, + **kwds): + """ + Plot the input series with the index on the x-axis using matplotlib + + Parameters + ---------- + label : label argument to provide to plot + kind : {'line', 'bar'} + rot : int, default 30 + Rotation for tick labels + use_index : boolean, default True + Plot index as axis tick labels + ax : matplotlib axis object + If not passed, uses gca() + style : string, default matplotlib default + matplotlib line style to use + + ax : matplotlib axis object + If not passed, uses gca() + kind : {'line', 'bar', 'barh'} + bar : vertical bar plot + barh : horizontal bar plot + logy : boolean, default False + For line plots, use log scaling on y axis + xticks : sequence + Values to use for the xticks + yticks : sequence + Values to use for the yticks + xlim : 2-tuple/list + ylim : 2-tuple/list + rot : int, default None + Rotation for ticks + kwds : keywords + Options to pass to matplotlib plotting method + + Notes + ----- + See matplotlib documentation online for more on this subject + """ + kind = _get_standard_kind(kind.lower().strip()) + if kind == 'line': + klass = LinePlot + elif kind in ('bar', 'barh'): + klass = BarPlot + elif kind == 'kde': + klass = KdePlot + + if ax is None: + ax = _gca() + if ax.get_yaxis().get_ticks_position().strip().lower() == 'right': + fig = _gcf() + axes = fig.get_axes() + for i in range(len(axes))[::-1]: + ax = axes[i] + ypos = ax.get_yaxis().get_ticks_position().strip().lower() + if ypos == 'left': + break + + # is there harm in this? 
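+ # If the current axes already hosts a right-hand (secondary) y-axis, the
+ # loop above walks backwards through the figure's axes to find the
+ # left-hand primary axes so the new series is drawn there rather than on
+ # the twinned axes.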
+ if label is None: + label = series.name + + plot_obj = klass(series, kind=kind, rot=rot, logy=logy, + ax=ax, use_index=use_index, style=style, + xticks=xticks, yticks=yticks, xlim=xlim, ylim=ylim, + legend=False, grid=grid, label=label, + secondary_y=secondary_y, **kwds) + + plot_obj.generate() + plot_obj.draw() + + return plot_obj.ax + +def boxplot(data, column=None, by=None, ax=None, fontsize=None, + rot=0, grid=True, figsize=None, **kwds): + """ + Make a box plot from DataFrame column optionally grouped b ysome columns or + other inputs + + Parameters + ---------- + data : DataFrame or Series + column : column name or list of names, or vector + Can be any valid input to groupby + by : string or sequence + Column in the DataFrame to group by + fontsize : int or string + rot : label rotation angle + kwds : other plotting keyword arguments to be passed to matplotlib boxplot + function + + Returns + ------- + ax : matplotlib.axes.AxesSubplot + """ + from pandas import Series, DataFrame + if isinstance(data, Series): + data = DataFrame({'x' : data}) + column = 'x' + + def plot_group(grouped, ax): + keys, values = zip(*grouped) + keys = [_stringify(x) for x in keys] + values = [remove_na(v) for v in values] + ax.boxplot(values, **kwds) + if kwds.get('vert', 1): + ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) + else: + ax.set_yticklabels(keys, rotation=rot, fontsize=fontsize) + + if column == None: + columns = None + else: + if isinstance(column, (list, tuple)): + columns = column + else: + columns = [column] + + if by is not None: + if not isinstance(by, (list, tuple)): + by = [by] + + fig, axes = _grouped_plot_by_column(plot_group, data, columns=columns, + by=by, grid=grid, figsize=figsize) + + # Return axes in multiplot case, maybe revisit later # 985 + ret = axes + else: + if ax is None: + ax = _gca() + fig = ax.get_figure() + data = data._get_numeric_data() + if columns: + cols = columns + else: + cols = data.columns + keys = [_stringify(x) for x in cols] + + # Return boxplot dict in single plot case + + clean_values = [remove_na(x) for x in data[cols].values.T] + bp = ax.boxplot(clean_values, **kwds) + if kwds.get('vert', 1): + ax.set_xticklabels(keys, rotation=rot, fontsize=fontsize) + else: + ax.set_yticklabels(keys, rotation=rot, fontsize=fontsize) + ax.grid(grid) + + ret = bp + + fig.subplots_adjust(bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2) + return ret + + +def _stringify(x): + if isinstance(x, tuple): + return '|'.join(str(y) for y in x) + else: + return str(x) + + +def format_date_labels(ax): + # mini version of autofmt_xdate + try: + for label in ax.get_xticklabels(): + label.set_ha('right') + label.set_rotation(30) + fig = ax.get_figure() + fig.subplots_adjust(bottom=0.2) + except Exception: # pragma: no cover + pass + + +def scatter_plot(data, x, y, by=None, ax=None, figsize=None, grid=False): + """ + + Returns + ------- + fig : matplotlib.Figure + """ + import matplotlib.pyplot as plt + + def plot_group(group, ax): + xvals = group[x].values + yvals = group[y].values + ax.scatter(xvals, yvals) + ax.grid(grid) + + if by is not None: + fig = _grouped_plot(plot_group, data, by=by, figsize=figsize, ax=ax) + else: + if ax is None: + fig = plt.figure() + ax = fig.add_subplot(111) + else: + fig = ax.get_figure() + plot_group(data, ax) + ax.set_ylabel(str(y)) + ax.set_xlabel(str(x)) + + ax.grid(grid) + + return fig + + +def hist_frame(data, grid=True, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None, ax=None, + sharex=False, sharey=False, **kwds): + 
""" + Draw Histogram the DataFrame's series using matplotlib / pylab. + + Parameters + ---------- + grid : boolean, default True + Whether to show axis grid lines + xlabelsize : int, default None + If specified changes the x-axis label size + xrot : float, default None + rotation of x axis labels + ylabelsize : int, default None + If specified changes the y-axis label size + yrot : float, default None + rotation of y axis labels + ax : matplotlib axes object, default None + sharex : bool, if True, the X axis will be shared amongst all subplots. + sharey : bool, if True, the Y axis will be shared amongst all subplots. + kwds : other plotting keyword arguments + To be passed to hist function + """ + import matplotlib.pyplot as plt + n = len(data.columns) + rows, cols = 1, 1 + while rows * cols < n: + if cols > rows: + rows += 1 + else: + cols += 1 + _, axes = _subplots(nrows=rows, ncols=cols, ax=ax, squeeze=False, + sharex=sharex, sharey=sharey) + + for i, col in enumerate(com._try_sort(data.columns)): + ax = axes[i / cols][i % cols] + ax.xaxis.set_visible(True) + ax.yaxis.set_visible(True) + ax.hist(data[col].dropna().values, **kwds) + ax.set_title(col) + ax.grid(grid) + + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + + for j in range(i + 1, rows * cols): + ax = axes[j / cols, j % cols] + ax.set_visible(False) + + ax.get_figure().subplots_adjust(wspace=0.3, hspace=0.3) + + return axes + +def hist_series(self, ax=None, grid=True, xlabelsize=None, xrot=None, + ylabelsize=None, yrot=None, **kwds): + """ + Draw histogram of the input series using matplotlib + + Parameters + ---------- + ax : matplotlib axis object + If not passed, uses gca() + grid : boolean, default True + Whether to show axis grid lines + xlabelsize : int, default None + If specified changes the x-axis label size + xrot : float, default None + rotation of x axis labels + ylabelsize : int, default None + If specified changes the y-axis label size + yrot : float, default None + rotation of y axis labels + kwds : keywords + To be passed to the actual plotting function + + Notes + ----- + See matplotlib documentation online for more on this + + """ + import matplotlib.pyplot as plt + + if ax is None: + ax = plt.gca() + + values = self.dropna().values + + ax.hist(values, **kwds) + ax.grid(grid) + + if xlabelsize is not None: + plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) + if xrot is not None: + plt.setp(ax.get_xticklabels(), rotation=xrot) + if ylabelsize is not None: + plt.setp(ax.get_yticklabels(), fontsize=ylabelsize) + if yrot is not None: + plt.setp(ax.get_yticklabels(), rotation=yrot) + + return ax + +def boxplot_frame_groupby(grouped, subplots=True, column=None, fontsize=None, + rot=0, grid=True, figsize=None, **kwds): + """ + Make box plots from DataFrameGroupBy data. 
+ + Parameters + ---------- + subplots : + * ``False`` - no subplots will be used + * ``True`` - create a subplot for each group + column : column name or list of names, or vector + Can be any valid input to groupby + fontsize : int or string + rot : label rotation angle + kwds : other plotting keyword arguments to be passed to matplotlib boxplot + function + + Returns + ------- + dict of key/value = group key/DataFrame.boxplot return value + or DataFrame.boxplot return value in case subplots=figures=False + + Examples + -------- + >>> import pandas + >>> import numpy as np + >>> import itertools + >>> + >>> tuples = [t for t in itertools.product(range(1000), range(4))] + >>> index = pandas.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> data = np.random.randn(len(index),4) + >>> df = pandas.DataFrame(data, columns=list('ABCD'), index=index) + >>> + >>> grouped = df.groupby(level='lvl1') + >>> boxplot_frame_groupby(grouped) + >>> + >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) + >>> boxplot_frame_groupby(grouped, subplots=False) + """ + if subplots is True: + nrows, ncols = _get_layout(len(grouped)) + _, axes = _subplots(nrows=nrows, ncols=ncols, squeeze=False, + sharex=False, sharey=True) + axes = axes.reshape(-1) if len(grouped) > 1 else axes + + ret = {} + for (key, group), ax in zip(grouped, axes): + d = group.boxplot(ax=ax, column=column, fontsize=fontsize, + rot=rot, grid=grid, figsize=figsize, **kwds) + ax.set_title(_stringify(key)) + ret[key] = d + else: + from pandas.tools.merge import concat + keys, frames = zip(*grouped) + if grouped.axis == 0: + df = concat(frames, keys=keys, axis=1) + else: + if len(frames) > 1: + df = frames[0].join(frames[1::]) + else: + df = frames[0] + ret = df.boxplot(column=column, fontsize=fontsize, rot=rot, + grid=grid, figsize=figsize, **kwds) + return ret + +def _grouped_plot(plotf, data, column=None, by=None, numeric_only=True, + figsize=None, sharex=True, sharey=True, layout=None, + rot=0, ax=None): + from pandas.core.frame import DataFrame + + # allow to specify mpl default with 'default' + if figsize is None or figsize == 'default': + figsize = (10, 5) # our default + + grouped = data.groupby(by) + if column is not None: + grouped = grouped[column] + + ngroups = len(grouped) + + nrows, ncols = layout or _get_layout(ngroups) + + if figsize is None: + # our favorite default beating matplotlib's idea of the + # default size + figsize = (10, 5) + fig, axes = _subplots(nrows=nrows, ncols=ncols, figsize=figsize, + sharex=sharex, sharey=sharey, ax=ax) + + ravel_axes = [] + for row in axes: + ravel_axes.extend(row) + + for i, (key, group) in enumerate(grouped): + ax = ravel_axes[i] + if numeric_only and isinstance(group, DataFrame): + group = group._get_numeric_data() + plotf(group, ax) + ax.set_title(str(key)) + + return fig, axes + +def _grouped_plot_by_column(plotf, data, columns=None, by=None, + numeric_only=True, grid=False, + figsize=None, ax=None): + import matplotlib.pyplot as plt + + grouped = data.groupby(by) + if columns is None: + columns = data._get_numeric_data().columns - by + ngroups = len(columns) + + nrows, ncols = _get_layout(ngroups) + fig, axes = _subplots(nrows=nrows, ncols=ncols, + sharex=True, sharey=True, + figsize=figsize, ax=ax) + + if isinstance(axes, plt.Axes): + ravel_axes = [axes] + else: + ravel_axes = [] + for row in axes: + if isinstance(row, plt.Axes): + ravel_axes.append(row) + else: + ravel_axes.extend(row) + + for i, col in enumerate(columns): + ax = ravel_axes[i] + gp_col = 
grouped[col] + plotf(gp_col, ax) + ax.set_title(col) + ax.set_xlabel(str(by)) + ax.grid(grid) + + byline = by[0] if len(by) == 1 else by + fig.suptitle('Boxplot grouped by %s' % byline) + + return fig, axes + +def _get_layout(nplots): + if nplots == 1: + return (1, 1) + elif nplots == 2: + return (1, 2) + elif nplots < 4: + return (2, 2) + + k = 1 + while k ** 2 < nplots: + k += 1 + + if (k - 1) * k >= nplots: + return k, (k - 1) + else: + return k, k + +# copied from matplotlib/pyplot.py for compatibility with matplotlib < 1.0 + +def _subplots(nrows=1, ncols=1, sharex=False, sharey=False, squeeze=True, + subplot_kw=None, ax=None, secondary_y=False, data=None, + **fig_kw): + """Create a figure with a set of subplots already made. + + This utility wrapper makes it convenient to create common layouts of + subplots, including the enclosing figure object, in a single call. + + Keyword arguments: + + nrows : int + Number of rows of the subplot grid. Defaults to 1. + + ncols : int + Number of columns of the subplot grid. Defaults to 1. + + sharex : bool + If True, the X axis will be shared amongst all subplots. + + sharey : bool + If True, the Y axis will be shared amongst all subplots. + + squeeze : bool + + If True, extra dimensions are squeezed out from the returned axis object: + - if only one subplot is constructed (nrows=ncols=1), the resulting + single Axis object is returned as a scalar. + - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object + array of Axis objects are returned as numpy 1-d arrays. + - for NxM subplots with N>1 and M>1 are returned as a 2d array. + + If False, no squeezing at all is done: the returned axis object is always + a 2-d array contaning Axis instances, even if it ends up being 1x1. + + subplot_kw : dict + Dict with keywords passed to the add_subplot() call used to create each + subplots. + + fig_kw : dict + Dict with keywords passed to the figure() call. Note that all keywords + not recognized above will be automatically included here. + + ax : Matplotlib axis object, default None + + secondary_y : boolean or sequence of ints, default False + If True then y-axis will be on the right + + Returns: + + fig, ax : tuple + - fig is the Matplotlib Figure object + - ax can be either a single axis object or an array of axis objects if + more than one supblot was created. The dimensions of the resulting array + can be controlled with the squeeze keyword, see above. + + **Examples:** + + x = np.linspace(0, 2*np.pi, 400) + y = np.sin(x**2) + + # Just a figure and one subplot + f, ax = plt.subplots() + ax.plot(x, y) + ax.set_title('Simple plot') + + # Two subplots, unpack the output array immediately + f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) + ax1.plot(x, y) + ax1.set_title('Sharing Y axis') + ax2.scatter(x, y) + + # Four polar axes + plt.subplots(2, 2, subplot_kw=dict(polar=True)) + """ + import matplotlib.pyplot as plt + from pandas.core.frame import DataFrame + + if subplot_kw is None: + subplot_kw = {} + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + # Create empty object array to hold all axes. 
It's easiest to make it 1-d + # so we can just append subplots upon creation, and then + nplots = nrows*ncols + axarr = np.empty(nplots, dtype=object) + + def on_right(i): + if isinstance(secondary_y, bool): + return secondary_y + if isinstance(data, DataFrame): + return data.columns[i] in secondary_y + + # Create first subplot separately, so we can share it if requested + ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) + if on_right(0): + orig_ax = ax0 + ax0 = ax0.twinx() + orig_ax.get_yaxis().set_visible(False) + + if sharex: + subplot_kw['sharex'] = ax0 + if sharey: + subplot_kw['sharey'] = ax0 + axarr[0] = ax0 + + # Note off-by-one counting because add_subplot uses the MATLAB 1-based + # convention. + for i in range(1, nplots): + ax = fig.add_subplot(nrows, ncols, i+1, **subplot_kw) + if on_right(i): + orig_ax = ax + ax = ax.twinx() + orig_ax.get_yaxis().set_visible(False) + axarr[i] = ax + + if nplots > 1: + if sharex and nrows > 1: + for i, ax in enumerate(axarr): + if np.ceil(float(i + 1) / ncols) < nrows: # only last row + [label.set_visible(False) for label in ax.get_xticklabels()] + if sharey and ncols > 1: + for i, ax in enumerate(axarr): + if (i % ncols) != 0: # only first column + [label.set_visible(False) for label in ax.get_yticklabels()] + + if squeeze: + # Reshape the array to have the final desired dimension (nrow,ncol), + # though discarding unneeded dimensions that equal 1. If we only have + # one subplot, just return it instead of a 1-element array. + if nplots==1: + axes = axarr[0] + else: + axes = axarr.reshape(nrows, ncols).squeeze() + else: + # returned axis array will be always 2-d, even if nrows=ncols=1 + axes = axarr.reshape(nrows, ncols) + + return fig, axes + +if __name__ == '__main__': + # import pandas.rpy.common as com + # sales = com.load_data('sanfrancisco.home.sales', package='nutshell') + # top10 = sales['zip'].value_counts()[:10].index + # sales2 = sales[sales.zip.isin(top10)] + # _ = scatter_plot(sales2, 'squarefeet', 'price', by='zip') + + # plt.show() + + import matplotlib.pyplot as plt + + import pandas.tools.plotting as plots + import pandas.core.frame as fr + reload(plots) + reload(fr) + from pandas.core.frame import DataFrame + + data = DataFrame([[3, 6, -5], [4, 8, 2], [4, 9, -6], + [4, 9, -3], [2, 5, -1]], + columns=['A', 'B', 'C']) + data.plot(kind='barh', stacked=True) + + plt.show() diff --git a/pandas/tools/tests/__init__.py b/pandas/tools/tests/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/pandas/tools/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/pandas/tools/tests/test_merge.py b/pandas/tools/tests/test_merge.py new file mode 100644 index 00000000..6be27a0a --- /dev/null +++ b/pandas/tools/tests/test_merge.py @@ -0,0 +1,1434 @@ +# pylint: disable=E1103 + +import nose +import unittest + +from datetime import datetime +from numpy.random import randn +from numpy import nan +import numpy as np +import random + +from pandas import * +from pandas.tseries.index import DatetimeIndex +from pandas.tools.merge import merge, concat, ordered_merge, MergeError +from pandas.util.testing import (assert_frame_equal, assert_series_equal, + assert_almost_equal, rands) +import pandas.lib as lib +import pandas.util.testing as tm + +a_ = np.array + +N = 50 +NGROUPS = 8 +JOIN_TYPES = ['inner', 'outer', 'left', 'right'] + + +def get_test_data(ngroups=NGROUPS, n=N): + unique_groups = range(ngroups) + arr = np.asarray(np.tile(unique_groups, n // ngroups)) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - 
len(arr)]) + + random.shuffle(arr) + return arr + +class TestMerge(unittest.TestCase): + + def setUp(self): + # aggregate multiple columns + self.df = DataFrame({'key1': get_test_data(), + 'key2': get_test_data(), + 'data1': np.random.randn(N), + 'data2': np.random.randn(N)}) + + # exclude a couple keys for fun + self.df = self.df[self.df['key2'] > 1] + + self.df2 = DataFrame({'key1' : get_test_data(n=N//5), + 'key2' : get_test_data(ngroups=NGROUPS//2, + n=N//5), + 'value': np.random.randn(N // 5)}) + + index, data = tm.getMixedTypeDict() + self.target = DataFrame(data, index=index) + + # Join on string value + self.source = DataFrame({'MergedA': data['A'], 'MergedD': data['D']}, + index=data['C']) + + self.left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + self.right = DataFrame({'v2': np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + def test_cython_left_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + ls, rs = lib.left_outer_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8, 9, 10]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5, -1, -1]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_(np.array_equal(ls, exp_ls)) + self.assert_(np.array_equal(rs, exp_rs)) + + def test_cython_right_outer_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) + max_group = 5 + + rs, ls = lib.left_outer_join(right, left, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + # 0 1 1 1 + exp_li = a_([0, 1, 2, 3, 4, 5, 3, 4, 5, 3, 4, 5, + # 2 2 4 + 6, 7, 8, 6, 7, 8, -1]) + exp_ri = a_([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, + 4, 4, 4, 5, 5, 5, 6]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_(np.array_equal(ls, exp_ls)) + self.assert_(np.array_equal(rs, exp_rs)) + + def test_cython_inner_join(self): + left = a_([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.int64) + right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) + max_group = 5 + + ls, rs = lib.inner_join(left, right, max_group) + + exp_ls = left.argsort(kind='mergesort') + exp_rs = right.argsort(kind='mergesort') + + exp_li = a_([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 7, 7, 8, 8]) + exp_ri = a_([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, + 4, 5, 4, 5, 4, 5]) + + exp_ls = exp_ls.take(exp_li) + exp_ls[exp_li == -1] = -1 + + exp_rs = exp_rs.take(exp_ri) + exp_rs[exp_ri == -1] = -1 + + self.assert_(np.array_equal(ls, exp_ls)) + self.assert_(np.array_equal(rs, exp_rs)) + + def test_left_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='left') + + joined_both = merge(self.df, self.df2) + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='left') + + def test_right_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='right') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='right') + + joined_both = merge(self.df, self.df2, how='right') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='right') + + def 
test_full_outer_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='outer') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='outer') + + joined_both = merge(self.df, self.df2, how='outer') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='outer') + + def test_inner_join(self): + joined_key2 = merge(self.df, self.df2, on='key2', how='inner') + _check_join(self.df, self.df2, joined_key2, ['key2'], how='inner') + + joined_both = merge(self.df, self.df2, how='inner') + _check_join(self.df, self.df2, joined_both, ['key1', 'key2'], + how='inner') + + def test_handle_overlap(self): + joined = merge(self.df, self.df2, on='key2', + suffixes=['.foo', '.bar']) + + self.assert_('key1.foo' in joined) + self.assert_('key1.bar' in joined) + + def test_handle_overlap_arbitrary_key(self): + joined = merge(self.df, self.df2, + left_on='key2', right_on='key1', + suffixes=['.foo', '.bar']) + self.assert_('key1.foo' in joined) + self.assert_('key2.bar' in joined) + + def test_merge_common(self): + joined = merge(self.df, self.df2) + exp = merge(self.df, self.df2, on=['key1', 'key2']) + tm.assert_frame_equal(joined, exp) + + def test_join_on(self): + target = self.target + source = self.source + + merged = target.join(source, on='C') + self.assert_(np.array_equal(merged['MergedA'], target['A'])) + self.assert_(np.array_equal(merged['MergedD'], target['D'])) + + # join with duplicates (fix regression from DataFrame/Matrix merge) + df = DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + joined = df.join(df2, on='key') + expected = DataFrame({'key': ['a', 'a', 'b', 'b', 'c'], + 'value': [0, 0, 1, 1, 2]}) + assert_frame_equal(joined, expected) + + # Test when some are missing + df_a = DataFrame([[1], [2], [3]], index=['a', 'b', 'c'], + columns=['one']) + df_b = DataFrame([['foo'], ['bar']], index=[1, 2], + columns=['two']) + df_c = DataFrame([[1], [2]], index=[1, 2], + columns=['three']) + joined = df_a.join(df_b, on='one') + joined = joined.join(df_c, on='one') + self.assert_(np.isnan(joined['two']['c'])) + self.assert_(np.isnan(joined['three']['c'])) + + # merge column not p resent + self.assertRaises(Exception, target.join, source, on='E') + + # overlap + source_copy = source.copy() + source_copy['A'] = 0 + self.assertRaises(Exception, target.join, source_copy, on='A') + + def test_join_on_pass_vector(self): + expected = self.target.join(self.source, on='C') + del expected['C'] + + join_col = self.target.pop('C') + result = self.target.join(self.source, on=join_col) + assert_frame_equal(result, expected) + + def test_join_with_len0(self): + # nothing to merge + merged = self.target.join(self.source.reindex([]), on='C') + for col in self.source: + self.assert_(col in merged) + self.assert_(merged[col].isnull().all()) + + merged2 = self.target.join(self.source.reindex([]), on='C', + how='inner') + self.assert_(merged2.columns.equals(merged.columns)) + self.assertEqual(len(merged2), 0) + + def test_join_on_inner(self): + df = DataFrame({'key': ['a', 'a', 'd', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1]}, index=['a', 'b']) + + joined = df.join(df2, on='key', how='inner') + + expected = df.join(df2, on='key') + expected = expected[expected['value'].notnull()] + self.assert_(np.array_equal(joined['key'], expected['key'])) + self.assert_(np.array_equal(joined['value'], expected['value'])) + self.assert_(joined.index.equals(expected.index)) + + def test_join_on_singlekey_list(self): + df = 
DataFrame({'key': ['a', 'a', 'b', 'b', 'c']}) + df2 = DataFrame({'value': [0, 1, 2]}, index=['a', 'b', 'c']) + + # corner cases + joined = df.join(df2, on=['key']) + expected = df.join(df2, on='key') + + assert_frame_equal(joined, expected) + + def test_join_on_series(self): + result = self.target.join(self.source['MergedA'], on='C') + expected = self.target.join(self.source[['MergedA']], on='C') + assert_frame_equal(result, expected) + + def test_join_on_series_buglet(self): + # GH #638 + df = DataFrame({'a': [1, 1]}) + ds = Series([2], index=[1], name='b') + result = df.join(ds, on='a') + expected = DataFrame({'a' : [1, 1], + 'b': [2, 2]}, index=df.index) + tm.assert_frame_equal(result, expected) + + def test_join_index_mixed(self): + df1 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(10), + columns=['A', 'B', 'C', 'D']) + self.assert_(df1['B'].dtype == np.int64) + self.assert_(df1['D'].dtype == np.bool_) + + df2 = DataFrame({'A': 1., 'B': 2, 'C': 'foo', 'D': True}, + index=np.arange(0, 10, 2), + columns=['A', 'B', 'C', 'D']) + + # overlap + joined = df1.join(df2, lsuffix='_one', rsuffix='_two') + expected_columns = ['A_one', 'B_one', 'C_one', 'D_one', + 'A_two', 'B_two', 'C_two', 'D_two'] + df1.columns = expected_columns[:4] + df2.columns = expected_columns[4:] + expected = _join_by_hand(df1, df2) + assert_frame_equal(joined, expected) + + # no overlapping blocks + df1 = DataFrame(index=np.arange(10)) + df1['bool'] = True + df1['string'] = 'foo' + + df2 = DataFrame(index=np.arange(5, 15)) + df2['int'] = 1 + df2['float'] = 1. + + for kind in JOIN_TYPES: + joined = df1.join(df2, how=kind) + expected = _join_by_hand(df1, df2, how=kind) + assert_frame_equal(joined, expected) + + joined = df2.join(df1, how=kind) + expected = _join_by_hand(df2, df1, how=kind) + assert_frame_equal(joined, expected) + + def test_join_empty_bug(self): + # generated an exception in 0.4.3 + x = DataFrame() + x.join(DataFrame([3], index=[0], columns=['A']), how='outer') + + def test_join_unconsolidated(self): + # GH #331 + a = DataFrame(randn(30,2), columns=['a','b']) + c = Series(randn(30)) + a['c'] = c + d = DataFrame(randn(30,1), columns=['q']) + + # it works! 
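+ # adding column 'c' after construction leaves 'a' unconsolidated;
+ # both join directions should succeed without raising (GH #331)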
+ a.join(d) + d.join(a) + + def test_join_multiindex(self): + index1 = MultiIndex.from_arrays([['a','a','a','b','b','b'], + [1,2,3,1,2,3]], + names=['first', 'second']) + + index2 = MultiIndex.from_arrays([['b','b','b','c','c','c'], + [1,2,3,1,2,3]], + names=['first', 'second']) + + df1 = DataFrame(data=np.random.randn(6), index=index1, + columns=['var X']) + df2 = DataFrame(data=np.random.randn(6), index=index2, + columns=['var Y']) + + df1 = df1.sortlevel(0) + df2 = df2.sortlevel(0) + + joined = df1.join(df2, how='outer') + ex_index = index1._tuple_index + index2._tuple_index + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + df1 = df1.sortlevel(1) + df2 = df2.sortlevel(1) + + joined = df1.join(df2, how='outer').sortlevel(0) + ex_index = index1._tuple_index + index2._tuple_index + expected = df1.reindex(ex_index).join(df2.reindex(ex_index)) + + assert_frame_equal(joined, expected) + self.assertEqual(joined.index.names, index1.names) + + def test_join_inner_multiindex(self): + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + data = DataFrame({'key1': key1, 'key2': key2, + 'data': data}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + to_join = DataFrame(np.random.randn(10, 3), index=index, + columns=['j_one', 'j_two', 'j_three']) + + joined = data.join(to_join, on=['key1', 'key2'], how='inner') + expected = merge(data, to_join.reset_index(), + left_on=['key1', 'key2'], + right_on=['first', 'second'], how='inner', + sort=False) + + expected2 = merge(to_join, data, + right_on=['key1', 'key2'], left_index=True, + how='inner', sort=False) + assert_frame_equal(joined, expected2.reindex_like(joined)) + + expected2 = merge(to_join, data, right_on=['key1', 'key2'], + left_index=True, how='inner', sort=False) + + expected = expected.drop(['first', 'second'], axis=1) + expected.index = joined.index + + self.assert_(joined.index.is_monotonic) + assert_frame_equal(joined, expected) + + # _assert_same_contents(expected, expected2.ix[:, expected.columns]) + + def test_join_float64_float32(self): + a = DataFrame(randn(10,2), columns=['a','b']) + b = DataFrame(randn(10,1), columns=['c']).astype(np.float32) + joined = a.join(b) + expected = a.join(b.astype('f8')) + assert_frame_equal(joined, expected) + + def test_join_many_non_unique_index(self): + df1 = DataFrame({"a": [1,1], "b": [1,1], "c": [10,20]}) + df2 = DataFrame({"a": [1,1], "b": [1,2], "d": [100,200]}) + df3 = DataFrame({"a": [1,1], "b": [1,2], "e": [1000,2000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + + result = idf1.join([idf2, idf3], how='outer') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='outer') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='outer') + + result = result.reset_index() + + result['a'] = result['a'].astype(np.float64) + result['b'] = result['b'].astype(np.float64) + + assert_frame_equal(result, expected.ix[:, result.columns]) + + df1 = DataFrame({"a": [1, 1, 1], "b": [1,1, 1], "c": [10,20, 30]}) + df2 = DataFrame({"a": [1, 1, 1], "b": [1,1, 2], "d": [100,200, 300]}) + df3 = DataFrame({"a": [1, 1, 1], "b": [1,1, 2], 
"e": [1000,2000, 3000]}) + idf1 = df1.set_index(["a", "b"]) + idf2 = df2.set_index(["a", "b"]) + idf3 = df3.set_index(["a", "b"]) + result = idf1.join([idf2, idf3], how='inner') + + df_partially_merged = merge(df1, df2, on=['a', 'b'], how='inner') + expected = merge(df_partially_merged, df3, on=['a', 'b'], how='inner') + + result = result.reset_index() + + assert_frame_equal(result, expected.ix[:, result.columns]) + + def test_merge_index_singlekey_right_vs_left(self): + left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + right = DataFrame({'v2': np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + merged1 = merge(left, right, left_on='key', + right_index=True, how='left', sort=False) + merged2 = merge(right, left, right_on='key', + left_index=True, how='right', sort=False) + assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + + merged1 = merge(left, right, left_on='key', + right_index=True, how='left', sort=True) + merged2 = merge(right, left, right_on='key', + left_index=True, how='right', sort=True) + assert_frame_equal(merged1, merged2.ix[:, merged1.columns]) + + def test_merge_index_singlekey_inner(self): + left = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'e', 'a'], + 'v1': np.random.randn(7)}) + right = DataFrame({'v2': np.random.randn(4)}, + index=['d', 'b', 'c', 'a']) + + # inner join + result = merge(left, right, left_on='key', right_index=True, + how='inner') + expected = left.join(right, on='key').ix[result.index] + assert_frame_equal(result, expected) + + result = merge(right, left, right_on='key', left_index=True, + how='inner') + expected = left.join(right, on='key').ix[result.index] + assert_frame_equal(result, expected.ix[:, result.columns]) + + def test_merge_misspecified(self): + self.assertRaises(Exception, merge, self.left, self.right, + left_index=True) + self.assertRaises(Exception, merge, self.left, self.right, + right_index=True) + + self.assertRaises(Exception, merge, self.left, self.left, + left_on='key', on='key') + + self.assertRaises(Exception, merge, self.df, self.df2, + left_on=['key1'], right_on=['key1', 'key2']) + + def test_merge_overlap(self): + merged = merge(self.left, self.left, on='key') + exp_len = (self.left['key'].value_counts() ** 2).sum() + self.assertEqual(len(merged), exp_len) + self.assert_('v1_x' in merged) + self.assert_('v1_y' in merged) + + def test_merge_different_column_key_names(self): + left = DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], + 'value': [1, 2, 3, 4]}) + right = DataFrame({'rkey': ['foo', 'bar', 'qux', 'foo'], + 'value' : [5, 6, 7, 8]}) + + merged = left.merge(right, left_on='lkey', right_on='rkey', + how='outer') + + assert_almost_equal(merged['lkey'], + ['bar', 'baz', 'foo', 'foo', 'foo', 'foo', np.nan]) + assert_almost_equal(merged['rkey'], + ['bar', np.nan, 'foo', 'foo', 'foo', 'foo', 'qux']) + assert_almost_equal(merged['value_x'], [2, 3, 1, 1, 4, 4, np.nan]) + assert_almost_equal(merged['value_y'], [6, np.nan, 5, 8, 5, 8, 7]) + + def test_merge_nocopy(self): + left = DataFrame({'a' : 0, 'b' : 1}, index=range(10)) + right = DataFrame({'c' : 'foo', 'd' : 'bar'}, index=range(10)) + + merged = merge(left, right, left_index=True, + right_index=True, copy=False) + + merged['a'] = 6 + self.assert_((left['a'] == 6).all()) + + merged['d'] = 'peekaboo' + self.assert_((right['d'] == 'peekaboo').all()) + + def test_join_sort(self): + left = DataFrame({'key' : ['foo', 'bar', 'baz', 'foo'], + 'value' : [1, 2, 3, 4]}) + right = DataFrame({'value2' : ['a', 'b', 'c']}, + 
index=['bar', 'baz', 'foo']) + + joined = left.join(right, on='key', sort=True) + expected = DataFrame({'key' : ['bar', 'baz', 'foo', 'foo'], + 'value' : [2, 3, 1, 4], + 'value2' : ['a', 'b', 'c', 'c']}, + index=[1, 2, 0, 3]) + assert_frame_equal(joined, expected) + + # smoke test + joined = left.join(right, on='key', sort=False) + self.assert_(np.array_equal(joined.index, range(4))) + + def test_intelligently_handle_join_key(self): + # #733, be a bit more 1337 about not returning unconsolidated DataFrame + + left = DataFrame({'key' : [1, 1, 2, 2, 3], + 'value' : range(5)}, columns=['value', 'key']) + right = DataFrame({'key' : [1, 1, 2, 3, 4, 5], + 'rvalue' : range(6)}) + + joined = merge(left, right, on='key', how='outer') + expected = DataFrame({'key' : [1, 1, 1, 1, 2, 2, 3, 4, 5.], + 'value' : np.array([0, 0, 1, 1, 2, 3, 4, + np.nan, np.nan]), + 'rvalue' : np.array([0, 1, 0, 1, 2, 2, 3, 4, 5])}, + columns=['value', 'key', 'rvalue']) + assert_frame_equal(joined, expected) + + self.assert_(joined._data.is_consolidated()) + + def test_handle_join_key_pass_array(self): + left = DataFrame({'key' : [1, 1, 2, 2, 3], + 'value' : range(5)}, columns=['value', 'key']) + right = DataFrame({'rvalue' : range(6)}) + key = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on='key', right_on=key, how='outer') + merged2 = merge(right, left, left_on=key, right_on='key', how='outer') + + assert_series_equal(merged['key'], merged2['key']) + self.assert_(merged['key'].notnull().all()) + self.assert_(merged2['key'].notnull().all()) + + left = DataFrame({'value' : range(5)}, columns=['value']) + right = DataFrame({'rvalue' : range(6)}) + lkey = np.array([1, 1, 2, 2, 3]) + rkey = np.array([1, 1, 2, 3, 4, 5]) + + merged = merge(left, right, left_on=lkey, right_on=rkey, how='outer') + self.assert_(np.array_equal(merged['key_0'], + np.array([1, 1, 1, 1, 2, 2, 3, 4, 5]))) + + left = DataFrame({'value': range(3)}) + right = DataFrame({'rvalue' : range(6)}) + + key = np.array([0, 1, 1, 2, 2, 3]) + merged = merge(left, right, left_index=True, right_on=key, how='outer') + self.assert_(np.array_equal(merged['key_0'], key)) + + def test_mixed_type_join_with_suffix(self): + # GH #916 + df = DataFrame(np.random.randn(20, 6), + columns=['a', 'b', 'c', 'd', 'e', 'f']) + df.insert(0, 'id', 0) + df.insert(5, 'dt', 'foo') + + grouped = df.groupby('id') + mn = grouped.mean() + cn = grouped.count() + + # it works! 
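+ # mean() and count() produce overlapping column names, so the join only
+ # needs rsuffix to disambiguate the overlap (GH #916)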
+ mn.join(cn, rsuffix='_right') + + def test_no_overlap_more_informative_error(self): + dt = datetime.now() + df1 = DataFrame({'x': ['a']}, index=[dt]) + + df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + self.assertRaises(MergeError, merge, df1, df2) + + def test_merge_non_unique_indexes(self): + + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + dt4 = datetime(2012, 5, 4) + + df1 = DataFrame({'x': ['a']}, index=[dt]) + df2 = DataFrame({'y': ['b', 'c']}, index=[dt, dt]) + _check_merge(df1, df2) + + # Not monotonic + df1 = DataFrame({'x': ['a', 'b', 'q']}, index=[dt2, dt, dt4]) + df2 = DataFrame({'y': ['c', 'd', 'e', 'f', 'g', 'h']}, + index=[dt3, dt3, dt2, dt2, dt, dt]) + _check_merge(df1, df2) + + df1 = DataFrame({'x': ['a', 'b']}, index=[dt, dt]) + df2 = DataFrame({'y': ['c', 'd']}, index=[dt, dt]) + _check_merge(df1, df2) + + def test_merge_non_unique_index_many_to_many(self): + dt = datetime(2012, 5, 1) + dt2 = datetime(2012, 5, 2) + dt3 = datetime(2012, 5, 3) + df1 = DataFrame({'x': ['a', 'b', 'c', 'd']}, + index=[dt2, dt2, dt, dt]) + df2 = DataFrame({'y': ['e', 'f', 'g',' h', 'i']}, + index=[dt2, dt2, dt3, dt, dt]) + _check_merge(df1, df2) + +def _check_merge(x, y): + for how in ['inner', 'left', 'outer']: + result = x.join(y, how=how) + + expected = merge(x.reset_index(), y.reset_index(), how=how) + expected = expected.set_index('index') + + assert_frame_equal(result, expected) + +class TestMergeMulti(unittest.TestCase): + + def setUp(self): + self.index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.to_join = DataFrame(np.random.randn(10, 3), index=self.index, + columns=['j_one', 'j_two', 'j_three']) + + # a little relevant example with NAs + key1 = ['bar', 'bar', 'bar', 'foo', 'foo', 'baz', 'baz', 'qux', + 'qux', 'snap'] + key2 = ['two', 'one', 'three', 'one', 'two', 'one', 'two', 'two', + 'three', 'one'] + + data = np.random.randn(len(key1)) + self.data = DataFrame({'key1' : key1, 'key2' : key2, + 'data' : data}) + + def test_merge_on_multikey(self): + joined = self.data.join(self.to_join, on=['key1', 'key2']) + + join_key = Index(zip(self.data['key1'], self.data['key2'])) + indexer = self.to_join.index.get_indexer(join_key) + ex_values = self.to_join.values.take(indexer, axis=0) + ex_values[indexer == -1] = np.nan + expected = self.data.join(DataFrame(ex_values, + columns=self.to_join.columns)) + + # TODO: columns aren't in the same order yet + assert_frame_equal(joined, expected.ix[:, joined.columns]) + + def test_merge_right_vs_left(self): + # compare left vs right merge with multikey + merged1 = self.data.merge(self.to_join, left_on=['key1', 'key2'], + right_index=True, how='left') + merged2 = self.to_join.merge(self.data, right_on=['key1', 'key2'], + left_index=True, how='right') + merged2 = merged2.ix[:, merged1.columns] + assert_frame_equal(merged1, merged2) + + def test_compress_group_combinations(self): + + # ~ 40000000 possible unique groups + key1 = np.array([rands(10) for _ in xrange(10000)], dtype='O') + key1 = np.tile(key1, 2) + key2 = key1[::-1] + + df = DataFrame({'key1' : key1, 'key2' : key2, + 'value1' : np.random.randn(20000)}) + + df2 = DataFrame({'key1' : key1[::2], 'key2' : key2[::2], + 'value2' : np.random.randn(10000)}) + + # just to hit the label compression code path + merged = merge(df, df2, how='outer') + + def test_left_join_index_preserve_order(self): + + left = 
DataFrame({'k1' : [0, 1, 2] * 8, + 'k2' : ['foo', 'bar'] * 12, + 'v' : np.arange(24)}) + + index = MultiIndex.from_tuples([(2, 'bar'), (1, 'foo')]) + right = DataFrame({'v2' : [5, 7]}, index=index) + + result = left.join(right, on=['k1', 'k2']) + + expected = left.copy() + expected['v2'] = np.nan + expected['v2'][(expected.k1 == 2) & (expected.k2 == 'bar')] = 5 + expected['v2'][(expected.k1 == 1) & (expected.k2 == 'foo')] = 7 + + tm.assert_frame_equal(result, expected) + + # do a right join for an extra test + joined = merge(right, left, left_index=True, + right_on=['k1', 'k2'], how='right') + tm.assert_frame_equal(joined.ix[:, expected.columns], expected) + + def test_left_merge_na_buglet(self): + left = DataFrame({'id': list('abcde'), 'v1': randn(5), + 'v2': randn(5), 'dummy' : list('abcde'), + 'v3' : randn(5)}, + columns=['id', 'v1', 'v2', 'dummy', 'v3']) + right = DataFrame({'id' : ['a', 'b', np.nan, np.nan, np.nan], + 'sv3' : [1.234, 5.678, np.nan, np.nan, np.nan]}) + + merged = merge(left, right, on='id', how='left') + + rdf = right.drop(['id'], axis=1) + expected = left.join(rdf) + tm.assert_frame_equal(merged, expected) + + +def _check_join(left, right, result, join_col, how='left', + lsuffix='_x', rsuffix='_y'): + + # some smoke tests + for c in join_col: + assert(result[c].notnull().all()) + + left_grouped = left.groupby(join_col) + right_grouped = right.groupby(join_col) + + for group_key, group in result.groupby(join_col): + l_joined = _restrict_to_columns(group, left.columns, lsuffix) + r_joined = _restrict_to_columns(group, right.columns, rsuffix) + + try: + lgroup = left_grouped.get_group(group_key) + except KeyError: + if how in ('left', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(l_joined, left.columns, join_col) + else: + _assert_same_contents(l_joined, lgroup) + + try: + rgroup = right_grouped.get_group(group_key) + except KeyError: + if how in ('right', 'inner'): + raise AssertionError('key %s should not have been in the join' + % str(group_key)) + + _assert_all_na(r_joined, right.columns, join_col) + else: + _assert_same_contents(r_joined, rgroup) + + +def _restrict_to_columns(group, columns, suffix): + found = [c for c in group.columns + if c in columns or c.replace(suffix, '') in columns] + + # filter + group = group.ix[:, found] + + # get rid of suffixes, if any + group = group.rename(columns=lambda x: x.replace(suffix, '')) + + # put in the right order... + group = group.ix[:, columns] + + return group + +def _assert_same_contents(join_chunk, source): + NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly... 
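+ # replace NaN with the sentinel so drop_duplicates treats missing values
+ # as equal, then compare the de-duplicated rows of the join chunk with
+ # the source frame as sets of tuples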
+ + jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values + svalues = source.fillna(NA_SENTINEL).drop_duplicates().values + + rows = set(tuple(row) for row in jvalues) + assert(len(rows) == len(source)) + assert(all(tuple(row) in rows for row in svalues)) + +def _assert_all_na(join_chunk, source_columns, join_col): + for c in source_columns: + if c in join_col: + continue + assert(join_chunk[c].isnull().all()) + + +def _join_by_hand(a, b, how='left'): + join_index = a.index.join(b.index, how=how) + + a_re = a.reindex(join_index) + b_re = b.reindex(join_index) + + result_columns = a.columns.append(b.columns) + + for col, s in b_re.iteritems(): + a_re[col] = s + return a_re.reindex(columns=result_columns) + +class TestConcatenate(unittest.TestCase): + + def setUp(self): + self.frame = DataFrame(tm.getSeriesData()) + self.mixed_frame = self.frame.copy() + self.mixed_frame['foo'] = 'bar' + + def test_append(self): + begin_index = self.frame.index[:5] + end_index = self.frame.index[5:] + + begin_frame = self.frame.reindex(begin_index) + end_frame = self.frame.reindex(end_index) + + appended = begin_frame.append(end_frame) + assert_almost_equal(appended['A'], self.frame['A']) + + del end_frame['A'] + partial_appended = begin_frame.append(end_frame) + self.assert_('A' in partial_appended) + + partial_appended = end_frame.append(begin_frame) + self.assert_('A' in partial_appended) + + # mixed type handling + appended = self.mixed_frame[:5].append(self.mixed_frame[5:]) + assert_frame_equal(appended, self.mixed_frame) + + # what to test here + mixed_appended = self.mixed_frame[:5].append(self.frame[5:]) + mixed_appended2 = self.frame[:5].append(self.mixed_frame[5:]) + + # all equal except 'foo' column + assert_frame_equal(mixed_appended.reindex(columns=['A', 'B', 'C', 'D']), + mixed_appended2.reindex(columns=['A', 'B', 'C', 'D'])) + + # append empty + empty = DataFrame({}) + + appended = self.frame.append(empty) + assert_frame_equal(self.frame, appended) + self.assert_(appended is not self.frame) + + appended = empty.append(self.frame) + assert_frame_equal(self.frame, appended) + self.assert_(appended is not self.frame) + + # overlap + self.assertRaises(Exception, self.frame.append, self.frame, + verify_integrity=True) + + def test_append_length0_frame(self): + df = DataFrame(columns=['A', 'B', 'C']) + df3 = DataFrame(index=[0, 1], columns=['A', 'B']) + df5 = df.append(df3) + + expected = DataFrame(index=[0, 1], columns=['A', 'B', 'C']) + assert_frame_equal(df5, expected) + + def test_append_records(self): + arr1 = np.zeros((2,),dtype=('i4,f4,a10')) + arr1[:] = [(1,2.,'Hello'),(2,3.,"World")] + + arr2 = np.zeros((3,),dtype=('i4,f4,a10')) + arr2[:] = [(3, 4.,'foo'), + (5, 6.,"bar"), + (7., 8., 'baz')] + + df1 = DataFrame(arr1) + df2 = DataFrame(arr2) + + result = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate((arr1, arr2))) + assert_frame_equal(result, expected) + + def test_append_different_columns(self): + df = DataFrame({'bools' : np.random.randn(10) > 0, + 'ints' : np.random.randint(0, 10, 10), + 'floats' : np.random.randn(10), + 'strings' : ['foo', 'bar'] * 5}) + + a = df[:5].ix[:, ['bools', 'ints', 'floats']] + b = df[5:].ix[:, ['strings', 'ints', 'floats']] + + appended = a.append(b) + self.assert_(isnull(appended['strings'][0:4]).all()) + self.assert_(isnull(appended['bools'][5:]).all()) + + def test_append_many(self): + chunks = [self.frame[:5], self.frame[5:10], + self.frame[10:15], self.frame[15:]] + + result = chunks[0].append(chunks[1:]) + 
tm.assert_frame_equal(result, self.frame) + + chunks[-1]['foo'] = 'bar' + result = chunks[0].append(chunks[1:]) + tm.assert_frame_equal(result.ix[:, self.frame.columns], self.frame) + self.assert_((result['foo'][15:] == 'bar').all()) + self.assert_(result['foo'][:15].isnull().all()) + + def test_append_preserve_index_name(self): + # #980 + df1 = DataFrame(data=None, columns=['A','B','C']) + df1 = df1.set_index(['A']) + df2 = DataFrame(data=[[1,4,7], [2,5,8], [3,6,9]], + columns=['A','B','C']) + df2 = df2.set_index(['A']) + + result = df1.append(df2) + self.assert_(result.index.name == 'A') + + def test_join_many(self): + df = DataFrame(np.random.randn(10, 6), columns=list('abcdef')) + df_list = [df[['a', 'b']], df[['c', 'd']], df[['e', 'f']]] + + joined = df_list[0].join(df_list[1:]) + tm.assert_frame_equal(joined, df) + + df_list = [df[['a', 'b']][:-2], + df[['c', 'd']][2:], df[['e', 'f']][1:9]] + + def _check_diff_index(df_list, result, exp_index): + reindexed = [x.reindex(exp_index) for x in df_list] + expected = reindexed[0].join(reindexed[1:]) + tm.assert_frame_equal(result, expected) + + + # different join types + joined = df_list[0].join(df_list[1:], how='outer') + _check_diff_index(df_list, joined, df.index) + + joined = df_list[0].join(df_list[1:]) + _check_diff_index(df_list, joined, df_list[0].index) + + joined = df_list[0].join(df_list[1:], how='inner') + _check_diff_index(df_list, joined, df.index[2:8]) + + self.assertRaises(ValueError, df_list[0].join, df_list[1:], on='a') + + def test_join_many_mixed(self): + df = DataFrame(np.random.randn(8, 4), columns=['A','B','C','D']) + df['key'] = ['foo', 'bar'] * 4 + df1 = df.ix[:, ['A', 'B']] + df2 = df.ix[:, ['C', 'D']] + df3 = df.ix[:, ['key']] + + result = df1.join([df2, df3]) + assert_frame_equal(result, df) + + def test_append_missing_column_proper_upcast(self): + df1 = DataFrame({'A' : np.array([1,2, 3, 4], dtype='i8')}) + df2 = DataFrame({'B' : np.array([True,False, True, False], + dtype=bool)}) + + appended = df1.append(df2, ignore_index=True) + self.assert_(appended['A'].dtype == 'f8') + self.assert_(appended['B'].dtype == 'O') + + def test_concat_with_group_keys(self): + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + # axis=0 + df = DataFrame(np.random.randn(3, 4)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1]) + exp_index = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1, 1], + [0, 1, 2, 0, 1, 2, 3]]) + expected = DataFrame(np.r_[df.values, df2.values], + index=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1]) + exp_index2 = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], + [0, 1, 2, 0, 1, 2]]) + expected = DataFrame(np.r_[df.values, df.values], + index=exp_index2) + tm.assert_frame_equal(result, expected) + + # axis=1 + df = DataFrame(np.random.randn(4, 3)) + df2 = DataFrame(np.random.randn(4, 4)) + + result = concat([df, df2], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df2.values], + columns=exp_index) + tm.assert_frame_equal(result, expected) + + result = concat([df, df], keys=[0, 1], axis=1) + expected = DataFrame(np.c_[df.values, df.values], + columns=exp_index2) + tm.assert_frame_equal(result, expected) + + def test_concat_keys_specific_levels(self): + df = DataFrame(np.random.randn(10, 4)) + pieces = [df.ix[:, [0, 1]], df.ix[:, [2]], df.ix[:, [3]]] + level = ['three', 'two', 'one', 'zero'] + result = concat(pieces, axis=1, keys=['one', 'two', 'three'], + levels=[level], + 
names=['group_key']) + + self.assert_(np.array_equal(result.columns.levels[0], level)) + self.assertEqual(result.columns.names[0], 'group_key') + + def test_concat_dataframe_keys_bug(self): + t1 = DataFrame({'value': Series([1,2,3], + index=Index(['a', 'b', 'c'], name='id'))}) + t2 = DataFrame({'value': Series([7, 8], + index=Index(['a', 'b'], name = 'id'))}) + + # it works + result = concat([t1, t2], axis=1, keys=['t1', 't2']) + self.assertEqual(list(result.columns), [('t1', 'value'), + ('t2', 'value')]) + + def test_concat_dict(self): + frames = {'foo' : DataFrame(np.random.randn(4, 3)), + 'bar' : DataFrame(np.random.randn(4, 3)), + 'baz' : DataFrame(np.random.randn(4, 3)), + 'qux' : DataFrame(np.random.randn(4, 3))} + + sorted_keys = sorted(frames) + + result = concat(frames) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys) + tm.assert_frame_equal(result, expected) + + result = concat(frames, axis=1) + expected = concat([frames[k] for k in sorted_keys], keys=sorted_keys, + axis=1) + tm.assert_frame_equal(result, expected) + + keys = ['baz', 'foo', 'bar'] + result = concat(frames, keys=keys) + expected = concat([frames[k] for k in keys], keys=keys) + tm.assert_frame_equal(result, expected) + + def test_concat_ignore_index(self): + frame1 = DataFrame({"test1": ["a", "b", "c"], + "test2": [1,2,3], + "test3": [4.5, 3.2, 1.2]}) + frame2 = DataFrame({"test3": [5.2, 2.2, 4.3]}) + frame1.index = Index(["x", "y", "z"]) + frame2.index = Index(["x", "y", "q"]) + + v1 = concat([frame1, frame2], axis=1, ignore_index=True) + + nan = np.nan + expected = DataFrame([[nan,nan,nan, 4.3], + ['a', 1, 4.5, 5.2], + ['b', 2, 3.2, 2.2], + ['c', 3, 1.2, nan]], + index=Index(["q", "x", "y", "z"])) + + tm.assert_frame_equal(v1, expected) + + def test_concat_multiindex_with_keys(self): + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + frame = DataFrame(np.random.randn(10, 3), index=index, + columns=Index(['A', 'B', 'C'], name='exp')) + result = concat([frame, frame], keys=[0, 1], names=['iteration']) + + self.assertEqual(result.index.names, ['iteration'] + index.names) + tm.assert_frame_equal(result.ix[0], frame) + tm.assert_frame_equal(result.ix[1], frame) + self.assertEqual(result.index.nlevels, 3) + + def test_concat_keys_and_levels(self): + df = DataFrame(np.random.randn(1, 3)) + df2 = DataFrame(np.random.randn(1, 4)) + + levels = [['foo', 'baz'], ['one', 'two']] + names = ['first', 'second'] + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels, + names=names) + expected = concat([df, df2, df, df2]) + exp_index = MultiIndex(levels=levels + [[0]], + labels=[[0, 0, 1, 1], [0, 1, 0, 1], + [0, 0, 0, 0]], + names=names + [None]) + expected.index = exp_index + + assert_frame_equal(result, expected) + + # no names + + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + levels=levels) + self.assertEqual(result.index.names, [None] * 3) + + # no levels + result = concat([df, df2, df, df2], + keys=[('foo', 'one'), ('foo', 'two'), + ('baz', 'one'), ('baz', 'two')], + names=['first', 'second']) + self.assertEqual(result.index.names, ['first', 'second'] + [None]) + self.assert_(np.array_equal(result.index.levels[0], ['baz', 'foo'])) + + def test_concat_keys_levels_no_overlap(self): + # GH #1406 + df = 
DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + + self.assertRaises(ValueError, concat, [df, df], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + self.assertRaises(ValueError, concat, [df, df2], + keys=['one', 'two'], levels=[['foo', 'bar', 'baz']]) + + def test_concat_rename_index(self): + a = DataFrame(np.random.rand(3,3), + columns=list('ABC'), + index=Index(list('abc'), name='index_a')) + b = DataFrame(np.random.rand(3,3), + columns=list('ABC'), + index=Index(list('abc'), name='index_b')) + + result = concat([a, b], keys=['key0', 'key1'], + names=['lvl0', 'lvl1']) + + exp = concat([a, b], keys=['key0', 'key1'], names=['lvl0']) + exp.index.names[1] = 'lvl1' + + tm.assert_frame_equal(result, exp) + self.assertEqual(result.index.names, exp.index.names) + + def test_crossed_dtypes_weird_corner(self): + columns = ['A', 'B', 'C', 'D'] + df1 = DataFrame({'A' : np.array([1, 2, 3, 4], dtype='f8'), + 'B' : np.array([1, 2, 3, 4], dtype='i8'), + 'C' : np.array([1, 2, 3, 4], dtype='f8'), + 'D' : np.array([1, 2, 3, 4], dtype='i8')}, + columns=columns) + + df2 = DataFrame({'A' : np.array([1, 2, 3, 4], dtype='i8'), + 'B' : np.array([1, 2, 3, 4], dtype='f8'), + 'C' : np.array([1, 2, 3, 4], dtype='i8'), + 'D' : np.array([1, 2, 3, 4], dtype='f8')}, + columns=columns) + + appended = df1.append(df2, ignore_index=True) + expected = DataFrame(np.concatenate([df1.values, df2.values], axis=0), + columns=columns) + tm.assert_frame_equal(appended, expected) + + df = DataFrame(np.random.randn(1, 3), index=['a']) + df2 = DataFrame(np.random.randn(1, 4), index=['b']) + result = concat([df, df2], keys=['one', 'two'], names=['first', 'second']) + self.assertEqual(result.index.names, ['first', 'second']) + + def test_handle_empty_objects(self): + df = DataFrame(np.random.randn(10, 4), columns=list('abcd')) + + baz = df[:5] + baz['foo'] = 'bar' + empty = df[5:5] + + frames = [baz, empty, empty, df[5:]] + concatted = concat(frames, axis=0) + + expected = df.ix[:, ['a', 'b', 'c', 'd', 'foo']] + expected['foo'] = expected['foo'].astype('O') + expected['foo'][:5] = 'bar' + + tm.assert_frame_equal(concatted, expected) + + def test_panel_join(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[:2, :10, :3] + p2 = panel.ix[2:, 5:, 2:] + + # left join + result = p1.join(p2) + expected = p1.copy() + expected['ItemC'] = p2['ItemC'] + tm.assert_panel_equal(result, expected) + + # right join + result = p1.join(p2, how='right') + expected = p2.copy() + expected['ItemA'] = p1['ItemA'] + expected['ItemB'] = p1['ItemB'] + expected = expected.reindex(items=['ItemA', 'ItemB', 'ItemC']) + tm.assert_panel_equal(result, expected) + + # inner join + result = p1.join(p2, how='inner') + expected = panel.ix[:, 5:10, 2:3] + tm.assert_panel_equal(result, expected) + + # outer join + result = p1.join(p2, how='outer') + expected = p1.reindex(major=panel.major_axis, + minor=panel.minor_axis) + expected = expected.join(p2.reindex(major=panel.major_axis, + minor=panel.minor_axis)) + tm.assert_panel_equal(result, expected) + + def test_panel_join_overlap(self): + panel = tm.makePanel() + tm.add_nans(panel) + + p1 = panel.ix[['ItemA', 'ItemB', 'ItemC']] + p2 = panel.ix[['ItemB', 'ItemC']] + + joined = p1.join(p2, lsuffix='_p1', rsuffix='_p2') + p1_suf = p1.ix[['ItemB', 'ItemC']].add_suffix('_p1') + p2_suf = p2.ix[['ItemB', 'ItemC']].add_suffix('_p2') + no_overlap = panel.ix[['ItemA']] + expected = p1_suf.join(p2_suf).join(no_overlap) + tm.assert_panel_equal(joined, 
expected) + + def test_panel_join_many(self): + tm.K = 10 + panel = tm.makePanel() + tm.K = 4 + + panels = [panel.ix[:2], panel.ix[2:6], panel.ix[6:]] + + joined = panels[0].join(panels[1:]) + tm.assert_panel_equal(joined, panel) + + panels = [panel.ix[:2, :-5], panel.ix[2:6, 2:], panel.ix[6:, 5:-7]] + + data_dict = {} + for p in panels: + data_dict.update(p.iterkv()) + + joined = panels[0].join(panels[1:], how='inner') + expected = Panel.from_dict(data_dict, intersect=True) + tm.assert_panel_equal(joined, expected) + + joined = panels[0].join(panels[1:], how='outer') + expected = Panel.from_dict(data_dict, intersect=False) + tm.assert_panel_equal(joined, expected) + + # edge cases + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='outer', lsuffix='foo', rsuffix='bar') + self.assertRaises(ValueError, panels[0].join, panels[1:], + how='right') + + def test_panel_concat_other_axes(self): + panel = tm.makePanel() + + p1 = panel.ix[:, :5, :] + p2 = panel.ix[:, 5:, :] + + result = concat([p1, p2], axis=1) + tm.assert_panel_equal(result, panel) + + p1 = panel.ix[:, :, :2] + p2 = panel.ix[:, :, 2:] + + result = concat([p1, p2], axis=2) + tm.assert_panel_equal(result, panel) + + # if things are a bit misbehaved + p1 = panel.ix[:2, :, :2] + p2 = panel.ix[:, :, 2:] + p1['ItemC'] = 'baz' + + result = concat([p1, p2], axis=2) + + expected = panel.copy() + expected['ItemC'] = expected['ItemC'].astype('O') + expected.ix['ItemC', :, :2] = 'baz' + tm.assert_panel_equal(result, expected) + + def test_concat_series(self): + ts = tm.makeTimeSeries() + ts.name = 'foo' + + pieces = [ts[:5], ts[5:15], ts[15:]] + + result = concat(pieces) + tm.assert_series_equal(result, ts) + self.assertEqual(result.name, ts.name) + + result = concat(pieces, keys=[0, 1, 2]) + expected = ts.copy() + + ts.index = DatetimeIndex(np.array(ts.index.values, dtype='M8[ns]')) + + exp_labels = [np.repeat([0, 1, 2], [len(x) for x in pieces]), + np.arange(len(ts))] + exp_index = MultiIndex(levels=[[0, 1, 2], ts.index], + labels=exp_labels) + expected.index = exp_index + tm.assert_series_equal(result, expected) + + def test_concat_series_axis1(self): + ts = tm.makeTimeSeries() + + pieces = [ts[:-2], ts[2:], ts[2:-2]] + + result = concat(pieces, axis=1) + expected = DataFrame(pieces).T + assert_frame_equal(result, expected) + + result = concat(pieces, keys=['A', 'B', 'C'], axis=1) + expected = DataFrame(pieces, index=['A', 'B', 'C']).T + assert_frame_equal(result, expected) + + def test_concat_single_with_key(self): + df = DataFrame(np.random.randn(10, 4)) + + result = concat([df], keys=['foo']) + expected = concat([df, df], keys=['foo', 'bar']) + tm.assert_frame_equal(result, expected[:10]) + + def test_concat_exclude_none(self): + df = DataFrame(np.random.randn(10, 4)) + + pieces = [df[:5], None, None, df[5:]] + result = concat(pieces) + tm.assert_frame_equal(result, df) + self.assertRaises(Exception, concat, [None, None]) + + def test_concat_datetime64_block(self): + from pandas.tseries.index import date_range + + rng = date_range('1/1/2000', periods=10) + + df = DataFrame({'time': rng}) + + result = concat([df, df]) + self.assert_((result[:10]['time'] == rng).all()) + +class TestOrderedMerge(unittest.TestCase): + + def setUp(self): + self.left = DataFrame({'key': ['a', 'c', 'e'], + 'lvalue': [1, 2., 3]}) + + self.right = DataFrame({'key': ['b', 'c', 'd', 'f'], + 'rvalue': [1, 2, 3., 4]}) + + # GH #813 + + def test_basic(self): + result = ordered_merge(self.left, self.right, on='key') + expected = DataFrame({'key': ['a', 
'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1, nan, 2, nan, 3, nan], + 'rvalue': [nan, 1, 2, 3, nan, 4]}) + + assert_frame_equal(result, expected) + + def test_ffill(self): + result = ordered_merge(self.left, self.right, on='key', fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'], + 'lvalue': [1., 1, 2, 2, 3, 3.], + 'rvalue': [nan, 1, 2, 3, 3, 4]}) + assert_frame_equal(result, expected) + + def test_multigroup(self): + left = concat([self.left, self.left], ignore_index=True) + # right = concat([self.right, self.right], ignore_index=True) + + left['group'] = ['a'] * 3 + ['b'] * 3 + # right['group'] = ['a'] * 4 + ['b'] * 4 + + result = ordered_merge(left, self.right, on='key', left_by='group', + fill_method='ffill') + expected = DataFrame({'key': ['a', 'b', 'c', 'd', 'e', 'f'] * 2, + 'lvalue': [1., 1, 2, 2, 3, 3.] * 2, + 'rvalue': [nan, 1, 2, 3, 3, 4] * 2}) + expected['group'] = ['a'] * 6 + ['b'] * 6 + + assert_frame_equal(result, expected.ix[:, result.columns]) + + result2 = ordered_merge(self.right, left, on='key', right_by='group', + fill_method='ffill') + assert_frame_equal(result, result2.ix[:, result.columns]) + + result = ordered_merge(left, self.right, on='key', left_by='group') + self.assert_(result['group'].notnull().all()) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py new file mode 100644 index 00000000..423fae8e --- /dev/null +++ b/pandas/tools/tests/test_pivot.py @@ -0,0 +1,326 @@ +import unittest + +import numpy as np + +from pandas import DataFrame, Series +from pandas.tools.merge import concat +from pandas.tools.pivot import pivot_table, crosstab +import pandas.util.testing as tm + +class TestPivotTable(unittest.TestCase): + + def setUp(self): + self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + def test_pivot_table(self): + rows = ['A', 'B'] + cols= 'C' + table = pivot_table(self.data, values='D', rows=rows, cols=cols) + + table2 = self.data.pivot_table(values='D', rows=rows, cols=cols) + tm.assert_frame_equal(table, table2) + + # this works + pivot_table(self.data, values='D', rows=rows) + + if len(rows) > 1: + self.assertEqual(table.index.names, rows) + else: + self.assertEqual(table.index.name, rows[0]) + + if len(cols) > 1: + self.assertEqual(table.columns.names, cols) + else: + self.assertEqual(table.columns.name, cols[0]) + + expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pass_array(self): + result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C) + expected = self.data.pivot_table('D', rows='A', cols='C') + tm.assert_frame_equal(result, expected) + + def test_pass_function(self): + result = self.data.pivot_table('D', rows=lambda x: x // 5, + cols=self.data.C) + expected = self.data.pivot_table('D', rows=self.data.index // 5, + cols='C') + tm.assert_frame_equal(result, expected) + + def test_pivot_table_multiple(self): + rows = ['A', 'B'] + cols= 'C' + table = pivot_table(self.data, rows=rows, cols=cols) + expected = self.data.groupby(rows 
+ [cols]).agg(np.mean).unstack() + tm.assert_frame_equal(table, expected) + + def test_pivot_multi_values(self): + result = pivot_table(self.data, values=['D', 'E'], + rows='A', cols=['B', 'C'], fill_value=0) + expected = pivot_table(self.data.drop(['F'], axis=1), + rows='A', cols=['B', 'C'], fill_value=0) + tm.assert_frame_equal(result, expected) + + def test_pivot_multi_functions(self): + f = lambda func: pivot_table(self.data, values=['D', 'E'], + rows=['A', 'B'], cols='C', + aggfunc=func) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = concat([means, stds], keys=['mean', 'std'], axis=1) + tm.assert_frame_equal(result, expected) + + # margins not supported?? + f = lambda func: pivot_table(self.data, values=['D', 'E'], + rows=['A', 'B'], cols='C', + aggfunc=func, margins=True) + result = f([np.mean, np.std]) + means = f(np.mean) + stds = f(np.std) + expected = concat([means, stds], keys=['mean', 'std'], axis=1) + tm.assert_frame_equal(result, expected) + + def test_margins(self): + def _check_output(res, col, rows=['A', 'B'], cols=['C']): + cmarg = res['All'][:-1] + exp = self.data.groupby(rows)[col].mean() + tm.assert_series_equal(cmarg, exp) + + rmarg = res.xs(('All', ''))[:-1] + exp = self.data.groupby(cols)[col].mean() + tm.assert_series_equal(rmarg, exp) + + gmarg = res['All']['All', ''] + exp = self.data[col].mean() + self.assertEqual(gmarg, exp) + + # column specified + table = self.data.pivot_table('D', rows=['A', 'B'], cols='C', + margins=True, aggfunc=np.mean) + _check_output(table, 'D') + + # no column specified + table = self.data.pivot_table(rows=['A', 'B'], cols='C', + margins=True, aggfunc=np.mean) + for valcol in table.columns.levels[0]: + _check_output(table[valcol], valcol) + + # no col + + # to help with a buglet + self.data.columns = [k * 2 for k in self.data.columns] + table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, + aggfunc=np.mean) + for valcol in table.columns: + gmarg = table[valcol]['All', ''] + self.assertEqual(gmarg, self.data[valcol].mean()) + + # this is OK + table = self.data.pivot_table(rows=['AA', 'BB'], margins=True, + aggfunc='mean') + + # no rows + rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True, + aggfunc=np.mean) + self.assert_(isinstance(rtable, Series)) + for item in ['DD', 'EE', 'FF']: + gmarg = table[item]['All', ''] + self.assertEqual(gmarg, self.data[item].mean()) + + def test_pivot_integer_columns(self): + # caused by upstream bug in unstack + from pandas.util.compat import product + import datetime + import pandas + + d = datetime.date.min + data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], + [d + datetime.timedelta(i) for i in xrange(20)], [1.0])) + df = pandas.DataFrame(data) + table = df.pivot_table(values=4, rows=[0,1,3],cols=[2]) + + df2 = df.rename(columns=str) + table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2']) + + tm.assert_frame_equal(table, table2) + + def test_pivot_no_level_overlap(self): + # GH #1181 + + data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2, + 'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2, + 'c': (['foo'] * 4 + ['bar'] * 4) * 2, + 'value': np.random.randn(16)}) + + table = data.pivot_table('value', rows='a', cols=['b', 'c']) + + grouped = data.groupby(['a', 'b', 'c'])['value'].mean() + expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all') + tm.assert_frame_equal(table, expected) + + def test_pivot_columns_lexsorted(self): + import datetime + import numpy as np + import pandas + + n = 10000 + 
+ dtype = np.dtype([ + ("Index", object), + ("Symbol", object), + ("Year", int), + ("Month", int), + ("Day", int), + ("Quantity", int), + ("Price", float), + ]) + + products = np.array([ + ('SP500', 'ADBE'), + ('SP500', 'NVDA'), + ('SP500', 'ORCL'), + ('NDQ100', 'AAPL'), + ('NDQ100', 'MSFT'), + ('NDQ100', 'GOOG'), + ('FTSE', 'DGE.L'), + ('FTSE', 'TSCO.L'), + ('FTSE', 'GSK.L'), + ], dtype=[('Index', object), ('Symbol', object)]) + items = np.empty(n, dtype=dtype) + iproduct = np.random.randint(0, len(products), n) + items['Index'] = products['Index'][iproduct] + items['Symbol'] = products['Symbol'][iproduct] + dr = pandas.date_range(datetime.date(2000, 1, 1), + datetime.date(2010, 12, 31)) + dates = dr[np.random.randint(0, len(dr), n)] + items['Year'] = dates.year + items['Month'] = dates.month + items['Day'] = dates.day + items['Price'] = np.random.lognormal(4.0, 2.0, n) + + df = DataFrame(items) + + pivoted = df.pivot_table('Price', rows=['Month', 'Day'], + cols=['Index', 'Symbol', 'Year'], + aggfunc='mean') + + self.assert_(pivoted.columns.is_monotonic) + +class TestCrosstab(unittest.TestCase): + + def setUp(self): + df = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo', + 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B' : ['one', 'one', 'one', 'two', + 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C' : ['dull', 'dull', 'shiny', 'dull', + 'dull', 'shiny', 'shiny', 'dull', + 'shiny', 'shiny', 'shiny'], + 'D' : np.random.randn(11), + 'E' : np.random.randn(11), + 'F' : np.random.randn(11)}) + + self.df = df.append(df, ignore_index=True) + + def test_crosstab_single(self): + df = self.df + result = crosstab(df['A'], df['C']) + expected = df.groupby(['A', 'C']).size().unstack() + tm.assert_frame_equal(result, expected.fillna(0).astype(np.int64)) + + def test_crosstab_multiple(self): + df = self.df + + result = crosstab(df['A'], [df['B'], df['C']]) + expected = df.groupby(['A', 'B', 'C']).size() + expected = expected.unstack('B').unstack('C').fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + result = crosstab([df['B'], df['C']], df['A']) + expected = df.groupby(['B', 'C', 'A']).size() + expected = expected.unstack('A').fillna(0).astype(np.int64) + tm.assert_frame_equal(result, expected) + + def test_crosstab_ndarray(self): + a = np.random.randint(0, 5, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 10, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c')) + expected = crosstab(df['a'], [df['b'], df['c']]) + tm.assert_frame_equal(result, expected) + + result = crosstab([b, c], a, colnames=['a'], rownames=('b', 'c')) + expected = crosstab([df['b'], df['c']], df['a']) + tm.assert_frame_equal(result, expected) + + # assign arbitrary names + result = crosstab(self.df['A'].values, self.df['C'].values) + self.assertEqual(result.index.name, 'row_0') + self.assertEqual(result.columns.name, 'col_0') + + def test_crosstab_margins(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + + df = DataFrame({'a': a, 'b': b, 'c': c}) + + result = crosstab(a, [b, c], rownames=['a'], colnames=('b', 'c'), + margins=True) + + self.assertEqual(result.index.names, ['a']) + self.assertEqual(result.columns.names, ['b', 'c']) + + all_cols = result['All', ''] + exp_cols = df.groupby(['a']).size().astype('i8') + exp_cols = exp_cols.append(Series([len(df)], index=['All'])) + + tm.assert_series_equal(all_cols, 
exp_cols) + + all_rows = result.ix['All'] + exp_rows = df.groupby(['b', 'c']).size().astype('i8') + exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) + + exp_rows = exp_rows.reindex(all_rows.index) + exp_rows = exp_rows.fillna(0).astype(np.int64) + tm.assert_series_equal(all_rows, exp_rows) + + def test_crosstab_pass_values(self): + a = np.random.randint(0, 7, size=100) + b = np.random.randint(0, 3, size=100) + c = np.random.randint(0, 5, size=100) + values = np.random.randn(100) + + table = crosstab([a, b], c, values, aggfunc=np.sum, + rownames=['foo', 'bar'], colnames=['baz']) + + df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values' : values}) + + expected = df.pivot_table('values', rows=['foo', 'bar'], cols='baz', + aggfunc=np.sum) + tm.assert_frame_equal(table, expected) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py new file mode 100644 index 00000000..8dad2599 --- /dev/null +++ b/pandas/tools/tests/test_tile.py @@ -0,0 +1,187 @@ +import nose +import unittest + +import numpy as np + +from pandas import DataFrame, Series, unique +import pandas.util.testing as tm +import pandas.core.common as com + +from pandas.core.algorithms import quantile +from pandas.tools.tile import cut, qcut +import pandas.tools.tile as tmod + +from numpy.testing import assert_equal, assert_almost_equal + +class TestCut(unittest.TestCase): + + def test_simple(self): + data = np.ones(5) + result = cut(data, 4, labels=False) + desired = [1, 1, 1, 1, 1] + assert_equal(result, desired) + + def test_bins(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) + result, bins = cut(data, 3, retbins=True) + assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7]) + + def test_right(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=True, retbins=True) + assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 0]) + assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) + + def test_noright(self): + data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) + result, bins = cut(data, 4, right=False, retbins=True) + assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 1]) + assert_almost_equal(bins, [ 0.2, 2.575, 4.95, 7.325, 9.7095]) + + def test_arraylike(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + result, bins = cut(data, 3, retbins=True) + assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_almost_equal(bins, [ 0.1905, 3.36666667, 6.53333333, 9.7]) + + def test_bins_not_monotonic(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + self.assertRaises(ValueError, cut, data, [0.1, 1.5, 1, 10]) + + def test_wrong_num_labels(self): + data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] + self.assertRaises(ValueError, cut, data, [0, 1, 10], + labels=['foo', 'bar', 'baz']) + + def test_cut_corner(self): + # h3h + self.assertRaises(ValueError, cut, [], 2) + + self.assertRaises(ValueError, cut, [1, 2, 3], 0.5) + + def test_cut_out_of_range_more(self): + # #1511 + s = Series([0, -1, 0, 1, -3]) + ind = cut(s, [0, 1], labels=False) + exp = [np.nan, np.nan, np.nan, 0, np.nan] + assert_almost_equal(ind, exp) + + def test_labels(self): + arr = np.tile(np.arange(0, 1.01, 0.1), 4) + + result, bins = cut(arr, 4, retbins=True) + ex_levels = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', + '(0.75, 1]'] + self.assert_(np.array_equal(result.levels, ex_levels)) + + result, bins = 
cut(arr, 4, retbins=True, right=False) + ex_levels = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', + '[0.75, 1.001)'] + self.assert_(np.array_equal(result.levels, ex_levels)) + + def test_cut_pass_series_name_to_factor(self): + s = Series(np.random.randn(100), name='foo') + + factor = cut(s, 4) + self.assertEquals(factor.name, 'foo') + + def test_label_precision(self): + arr = np.arange(0, 0.73, 0.01) + + result = cut(arr, 4, precision=2) + ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]', + '(0.54, 0.72]'] + self.assert_(np.array_equal(result.levels, ex_levels)) + + def test_na_handling(self): + arr = np.arange(0, 0.75, 0.01) + arr[::3] = np.nan + + result = cut(arr, 4) + + result_arr = np.asarray(result) + + ex_arr = np.where(com.isnull(arr), np.nan, result_arr) + + tm.assert_almost_equal(result_arr, ex_arr) + + result = cut(arr, 4, labels=False) + ex_result = np.where(com.isnull(arr), np.nan, result) + tm.assert_almost_equal(result, ex_result) + + def test_qcut(self): + arr = np.random.randn(1000) + + labels, bins = qcut(arr, 4, retbins=True) + ex_bins = quantile(arr, [0, .25, .5, .75, 1.]) + assert_almost_equal(bins, ex_bins) + + ex_levels = cut(arr, ex_bins, include_lowest=True) + self.assert_(np.array_equal(labels, ex_levels)) + + def test_qcut_bounds(self): + np.random.seed(12345) + arr = np.random.randn(1000) + + factor = qcut(arr, 10, labels=False) + self.assert_(len(np.unique(factor)) == 10) + + def test_qcut_specify_quantiles(self): + arr = np.random.randn(100) + + factor = qcut(arr, [0, .25, .5, .75, 1.]) + expected = qcut(arr, 4) + self.assert_(factor.equals(expected)) + + def test_cut_out_of_bounds(self): + np.random.seed(12345) + + arr = np.random.randn(100) + + result = cut(arr, [-1, 0, 1]) + + mask = result.labels == -1 + ex_mask = (arr < -1) | (arr > 1) + self.assert_(np.array_equal(mask, ex_mask)) + + def test_cut_pass_labels(self): + arr = [50, 5, 10, 15, 20, 30, 70] + bins = [0, 25, 50, 100] + labels = ['Small', 'Medium', 'Large'] + + result = cut(arr, bins, labels=labels) + + exp = cut(arr, bins) + exp.levels = labels + + self.assert_(result.equals(exp)) + + def test_qcut_include_lowest(self): + values = np.arange(10) + + cats = qcut(values, 4) + + ex_levels = ['[0, 2.25]', '(2.25, 4.5]', '(4.5, 6.75]', '(6.75, 9]'] + self.assert_((cats.levels == ex_levels).all()) + + def test_qcut_nas(self): + arr = np.random.randn(100) + arr[:20] = np.nan + + result = qcut(arr, 4) + self.assert_(com.isnull(result[:20]).all()) + + def test_label_formatting(self): + self.assertEquals(tmod._trim_zeros('1.000'), '1') + + # it works + result = cut(np.arange(11.), 2) + + result = cut(np.arange(11.) 
/ 1e10, 2) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + + diff --git a/pandas/tools/tests/test_tools.py b/pandas/tools/tests/test_tools.py new file mode 100644 index 00000000..baaf78ca --- /dev/null +++ b/pandas/tools/tests/test_tools.py @@ -0,0 +1,21 @@ +#import unittest + +from pandas import DataFrame +from pandas.tools.describe import value_range + +import numpy as np + +def test_value_range(): + df = DataFrame(np.random.randn(5, 5)) + df.ix[0,2] = -5 + df.ix[2,0] = 5 + + res = value_range(df) + + assert( res['Minimum'] == -5 ) + assert( res['Maximum'] == 5 ) + + df.ix[0,1] = np.NaN + + assert( res['Minimum'] == -5 ) + assert( res['Maximum'] == 5 ) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py new file mode 100644 index 00000000..dc756e8c --- /dev/null +++ b/pandas/tools/tile.py @@ -0,0 +1,218 @@ +""" +Quantilization functions and related stuff +""" + +from pandas.core.api import DataFrame, Series +from pandas.core.categorical import Categorical +import pandas.core.algorithms as algos +import pandas.core.common as com +import pandas.core.nanops as nanops + +import numpy as np + + +def cut(x, bins, right=True, labels=None, retbins=False, precision=3, + include_lowest=False): + """ + Return indices of half-open bins to which each value of `x` belongs. + + Parameters + ---------- + x : array-like + Input array to be binned. It has to be 1-dimensional. + bins : int or sequence of scalars + If `bins` is an int, it defines the number of equal-width bins in the + range of `x`. However, in this case, the range of `x` is extended + by .1% on each side to include the min or max values of `x`. If + `bins` is a sequence it defines the bin edges allowing for + non-uniform bin width. No extension of the range of `x` is done in + this case. + right : bool, optional + Indicates whether the bins include the rightmost edge or not. If + right == True (the default), then the bins [1,2,3,4] indicate + (1,2], (2,3], (3,4]. + labels : array or boolean, default None + Labels to use for bin edges, or False to return integer bin labels + retbins : bool, optional + Whether to return the bins or not. Can be useful if bins is given + as a scalar. + + Returns + ------- + out : Categorical or array of integers if labels is False + bins : ndarray of floats + Returned only if `retbins` is True. + + Notes + ----- + The `cut` function can be useful for going from a continuous variable to + a categorical variable. For example, `cut` could convert ages to groups + of age ranges. + + Any NA values will be NA in the result. Out of bounds values will be NA in + the resulting Categorical object + + + Examples + -------- + >>> cut(np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]), 3, retbins=True) + (array([(0.191, 3.367], (0.191, 3.367], (0.191, 3.367], (3.367, 6.533], + (6.533, 9.7], (0.191, 3.367]], dtype=object), + array([ 0.1905 , 3.36666667, 6.53333333, 9.7 ])) + >>> cut(np.ones(5), 4, labels=False) + array([2, 2, 2, 2, 2]) + """ + #NOTE: this binning code is changed a bit from histogram for var(x) == 0 + if not np.iterable(bins): + if np.isscalar(bins) and bins < 1: + raise ValueError("`bins` should be a positive integer.") + try: # for array-like + sz = x.size + except AttributeError: + x = np.asarray(x) + sz = x.size + if sz == 0: + raise ValueError('Cannot cut empty array') + # handle empty arrays. Can't determine range, so use 0-1. 
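        # Editor's note (illustrative aside, not part of the original patch):
        # when `bins` is an integer, the edges computed just below come from
        # np.linspace over (min, max), and the outermost edge is then nudged
        # by 0.1% of the range so the extreme value still lands inside a
        # half-open bin. A worked example, matching test_tile.test_bins:
        #
        #   >>> x = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1])
        #   >>> _, edges = cut(x, 3, retbins=True)
        #   >>> edges   # approximately
        #   array([0.1905, 3.3667, 6.5333, 9.7])
        #
        # i.e. the left edge is 0.2 - 0.001 * (9.7 - 0.2) = 0.1905.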
+ # rng = (0, 1) + else: + rng = (nanops.nanmin(x), nanops.nanmax(x)) + mn, mx = [mi + 0.0 for mi in rng] + + if mn == mx: # adjust end points before binning + mn -= .001 * mn + mx += .001 * mx + bins = np.linspace(mn, mx, bins+1, endpoint=True) + else: # adjust end points after binning + bins = np.linspace(mn, mx, bins+1, endpoint=True) + adj = (mx - mn) * 0.001 # 0.1% of the range + if right: + bins[0] -= adj + else: + bins[-1] += adj + + else: + bins = np.asarray(bins) + if (np.diff(bins) < 0).any(): + raise ValueError('bins must increase monotonically.') + + return _bins_to_cuts(x, bins, right=right, labels=labels, + retbins=retbins, precision=precision, + include_lowest=include_lowest) + + + +def qcut(x, q, labels=None, retbins=False, precision=3): + """ + Quantile-based discretization function. Discretize variable into + equal-sized buckets based on rank or based on sample quantiles. For example + 1000 values for 10 quantiles would produce 1000 integers from 0 to 9 + indicating the + + Parameters + ---------- + x : ndarray or Series + q : integer or array of quantiles + Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately + array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles + labels : array or boolean, default None + Labels to use for bin edges, or False to return integer bin labels + retbins : bool, optional + Whether to return the bins or not. Can be useful if bins is given + as a scalar. + + Returns + ------- + cat : Categorical + + Notes + ----- + Out of bounds values will be NA in the resulting Categorical object + + Examples + -------- + """ + if com.is_integer(q): + quantiles = np.linspace(0, 1, q + 1) + else: + quantiles = q + bins = algos.quantile(x, quantiles) + return _bins_to_cuts(x, bins, labels=labels, retbins=retbins, + precision=precision, include_lowest=True) + + +def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, + precision=3, name=None, include_lowest=False): + if name is None and isinstance(x, Series): + name = x.name + x = np.asarray(x) + + side = 'left' if right else 'right' + ids = bins.searchsorted(x, side=side) + + if include_lowest: + ids[x == bins[0]] = 1 + + na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0) + has_nas = na_mask.any() + + if labels is not False: + if labels is None: + fmt = lambda v: _format_label(v, precision=precision) + if right: + levels = ['(%s, %s]' % (fmt(a), fmt(b)) + for a, b in zip(bins, bins[1:])] + if include_lowest: + levels[0] = '[' + levels[0][1:] + else: + levels = ['[%s, %s)' % (fmt(a), fmt(b)) + for a, b in zip(bins, bins[1:])] + + else: + if len(labels) != len(bins) - 1: + raise ValueError('Bin labels must be one fewer than ' + 'the number of bin edges') + levels = labels + + levels = np.asarray(levels, dtype=object) + np.putmask(ids, na_mask, 0) + fac = Categorical(ids - 1, levels, name=name) + else: + fac = ids - 1 + if has_nas: + fac = fac.astype(np.float64) + np.putmask(fac, na_mask, np.nan) + + if not retbins: + return fac + + return fac, bins + + +def _format_label(x, precision=3): + fmt_str = '%%.%dg' % precision + if com.is_float(x): + frac, whole = np.modf(x) + sgn = '-' if x < 0 else '' + whole = abs(whole) + if frac != 0.0: + val = fmt_str % frac + if 'e' in val: + return _trim_zeros(fmt_str % x) + else: + val = _trim_zeros(val) + if '.' 
in val: + return sgn + '.'.join(('%d' % whole, val.split('.')[1])) + else: # pragma: no cover + return sgn + '.'.join(('%d' % whole, val)) + else: + return sgn + '%d' % whole + else: + return str(x) + +def _trim_zeros(x): + while len(x) > 1 and x[-1] == '0': + x = x[:-1] + if len(x) > 1 and x[-1] == '.': + x = x[:-1] + return x diff --git a/pandas/tools/util.py b/pandas/tools/util.py new file mode 100644 index 00000000..f10bfe1e --- /dev/null +++ b/pandas/tools/util.py @@ -0,0 +1,6 @@ +from pandas.core.index import Index + +def match(needles, haystack): + haystack = Index(haystack) + needles = Index(needles) + return haystack.get_indexer(needles) diff --git a/pandas/tseries/__init__.py b/pandas/tseries/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/tseries/api.py b/pandas/tseries/api.py new file mode 100644 index 00000000..ead5a17c --- /dev/null +++ b/pandas/tseries/api.py @@ -0,0 +1,11 @@ +""" + +""" + + +from pandas.tseries.index import DatetimeIndex, date_range, bdate_range +from pandas.tseries.frequencies import infer_freq +from pandas.tseries.period import Period, PeriodIndex, period_range, pnow +from pandas.tseries.resample import TimeGrouper +from pandas.lib import NaT +import pandas.tseries.offsets as offsets diff --git a/pandas/tseries/converter.py b/pandas/tseries/converter.py new file mode 100644 index 00000000..86245ad4 --- /dev/null +++ b/pandas/tseries/converter.py @@ -0,0 +1,697 @@ +from datetime import datetime +import datetime as pydt +import numpy as np + +import matplotlib.units as units +import matplotlib.dates as dates +from matplotlib.ticker import Formatter, AutoLocator, Locator +from matplotlib.transforms import nonsingular + +import pandas.lib as lib +import pandas.core.common as com +from pandas.core.index import Index + +import pandas.tseries.tools as tools +import pandas.tseries.frequencies as frequencies +from pandas.tseries.frequencies import FreqGroup +from pandas.tseries.period import Period, PeriodIndex + +def register(): + units.registry[pydt.time] = TimeConverter() + units.registry[lib.Timestamp] = DatetimeConverter() + units.registry[pydt.date] = DatetimeConverter() + units.registry[pydt.datetime] = DatetimeConverter() + units.registry[Period] = PeriodConverter() + +def _to_ordinalf(tm): + tot_sec = tm.hour * 3600 + tm.minute * 60 + tm.second + tm.microsecond + return tot_sec + +def time2num(d): + if isinstance(d, str): + parsed = tools.to_datetime(d) + if not isinstance(parsed, datetime): + raise ValueError('Could not parse time %s' % d) + return _to_ordinalf(parsed.time()) + if isinstance(d, pydt.time): + return _to_ordinalf(d) + return d + +class TimeConverter(units.ConversionInterface): + + @staticmethod + def convert(value, unit, axis): + valid_types = (str, pydt.time) + if (isinstance(value, valid_types) or com.is_integer(value) or + com.is_float(value)): + return time2num(value) + if isinstance(value, Index): + return value.map(time2num) + if isinstance(value, (list, tuple, np.ndarray)): + return [time2num(x) for x in value] + return value + + @staticmethod + def axisinfo(unit, axis): + if unit != 'time': + return None + + majloc = AutoLocator() + majfmt = TimeFormatter(majloc) + return units.AxisInfo(majloc=majloc, majfmt=majfmt, label='time') + + @staticmethod + def default_units(x, axis): + return 'time' + +### time formatter +class TimeFormatter(Formatter): + + def __init__(self, locs): + self.locs = locs + + def __call__(self, x, pos=0): + fmt = '%H:%M:%S' + s = int(x) + us = int((x - s) * 1e6) + m, s = 
divmod(s, 60) + h, m = divmod(m, 60) + return pydt.time(h, m, s, us).strftime(fmt) + +### Period Conversion + +class PeriodConverter(dates.DateConverter): + + @staticmethod + def convert(values, units, axis): + if not hasattr(axis, 'freq'): + raise TypeError('Axis must have `freq` set to convert to Periods') + valid_types = (str, datetime, Period, pydt.date, pydt.time) + if (isinstance(values, valid_types) or com.is_integer(values) or + com.is_float(values)): + return get_datevalue(values, axis.freq) + if isinstance(values, Index): + return values.map(lambda x: get_datevalue(x, axis.freq)) + if isinstance(values, (list, tuple, np.ndarray)): + return [get_datevalue(x, axis.freq) for x in values] + return values + +def get_datevalue(date, freq): + if isinstance(date, Period): + return date.asfreq(freq).ordinal + elif isinstance(date, (str, datetime, pydt.date, pydt.time)): + return Period(date, freq).ordinal + elif (com.is_integer(date) or com.is_float(date) or + (isinstance(date, np.ndarray) and (date.size == 1))): + return date + elif date is None: + return None + raise ValueError("Unrecognizable date '%s'" % date) + +### Datetime Conversion +class DatetimeConverter(dates.DateConverter): + + @staticmethod + def convert(values, unit, axis): + def try_parse(values): + try: + return tools.to_datetime(values).toordinal() + except Exception: + return values + + if isinstance(values, (datetime, pydt.date)): + return values.toordinal() + elif isinstance(values, pydt.time): + return dates.date2num(values) + elif (com.is_integer(values) or com.is_float(values)): + return values + elif isinstance(values, str): + return try_parse(values) + elif isinstance(values, Index): + return values.map(try_parse) + elif isinstance(values, (list, tuple, np.ndarray)): + return [try_parse(x) for x in values] + return values + +### Fixed frequency dynamic tick locators and formatters + +##### ------------------------------------------------------------------------- +#---- --- Locators --- +##### ------------------------------------------------------------------------- + +def _get_default_annual_spacing(nyears): + """ + Returns a default spacing between consecutive ticks for annual data. + """ + if nyears < 11: + (min_spacing, maj_spacing) = (1, 1) + elif nyears < 20: + (min_spacing, maj_spacing) = (1, 2) + elif nyears < 50: + (min_spacing, maj_spacing) = (1, 5) + elif nyears < 100: + (min_spacing, maj_spacing) = (5, 10) + elif nyears < 200: + (min_spacing, maj_spacing) = (5, 25) + elif nyears < 600: + (min_spacing, maj_spacing) = (10, 50) + else: + factor = nyears // 1000 + 1 + (min_spacing, maj_spacing) = (factor * 20, factor * 100) + return (min_spacing, maj_spacing) + +def period_break(dates, period): + """ + Returns the indices where the given period changes. + + Parameters + ---------- + dates : PeriodIndex + Array of intervals to monitor. + period : string + Name of the period to monitor. + """ + current = getattr(dates, period) + previous = getattr(dates - 1, period) + return (current - previous).nonzero()[0] + +def has_level_label(label_flags, vmin): + """ + Returns true if the ``label_flags`` indicate there is at least one label + for this level. + + if the minimum view limit is not an exact integer, then the first tick + label won't be shown, so we must adjust for that. 
+ """ + if label_flags.size == 0 or (label_flags.size == 1 and + label_flags[0] == 0 and + vmin % 1 > 0.0): + return False + else: + return True + +def _daily_finder(vmin, vmax, freq): + periodsperday = -1 + + if freq >= FreqGroup.FR_HR: + if freq == FreqGroup.FR_SEC: + periodsperday = 24 * 60 * 60 + elif freq == FreqGroup.FR_MIN: + periodsperday = 24 * 60 + elif freq == FreqGroup.FR_HR: + periodsperday = 24 + else: # pragma: no cover + raise ValueError("unexpected frequency: %s" % freq) + periodsperyear = 365 * periodsperday + periodspermonth = 28 * periodsperday + + elif freq == FreqGroup.FR_BUS: + periodsperyear = 261 + periodspermonth = 19 + elif freq == FreqGroup.FR_DAY: + periodsperyear = 365 + periodspermonth = 28 + elif frequencies.get_freq_group(freq) == FreqGroup.FR_WK: + periodsperyear = 52 + periodspermonth = 3 + elif freq == FreqGroup.FR_UND: + periodsperyear = 100 + periodspermonth = 10 + else: # pragma: no cover + raise ValueError("unexpected frequency") + + # save this for later usage + vmin_orig = vmin + + (vmin, vmax) = (Period(ordinal=int(vmin), freq=freq), + Period(ordinal=int(vmax), freq=freq)) + span = vmax.ordinal - vmin.ordinal + 1 + dates_ = PeriodIndex(start=vmin, end=vmax, freq=freq) + # Initialize the output + info = np.zeros(span, + dtype=[('val', np.int64), ('maj', bool), + ('min', bool), ('fmt', '|S20')]) + info['val'][:] = dates_.values + info['fmt'][:] = '' + info['maj'][[0, -1]] = True + # .. and set some shortcuts + info_maj = info['maj'] + info_min = info['min'] + info_fmt = info['fmt'] + + def first_label(label_flags): + if (label_flags[0] == 0) and (label_flags.size > 1) and \ + ((vmin_orig % 1) > 0.0): + return label_flags[1] + else: + return label_flags[0] + + # Case 1. Less than a month + if span <= periodspermonth: + day_start = period_break(dates_, 'day') + month_start = period_break(dates_, 'month') + + def _hour_finder(label_interval, force_year_start): + _hour = dates_.hour + _prev_hour = (dates_-1).hour + hour_start = (_hour - _prev_hour) != 0 + info_maj[day_start] = True + info_min[hour_start & (_hour % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt[hour_start & (_hour % label_interval == 0)] = '%H:%M' + info_fmt[day_start] = '%H:%M\n%d-%b' + info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + if force_year_start and not has_level_label(year_start, vmin_orig): + info_fmt[first_label(day_start)] = '%H:%M\n%d-%b\n%Y' + + def _minute_finder(label_interval): + hour_start = period_break(dates_, 'hour') + _minute = dates_.minute + _prev_minute = (dates_-1).minute + minute_start = (_minute - _prev_minute) != 0 + info_maj[hour_start] = True + info_min[minute_start & (_minute % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[minute_start & (_minute % label_interval == 0)] = '%H:%M' + info_fmt[day_start] = '%H:%M\n%d-%b' + info_fmt[year_start] = '%H:%M\n%d-%b\n%Y' + + def _second_finder(label_interval): + minute_start = period_break(dates_, 'minute') + _second = dates_.second + _prev_second = (dates_-1).second + second_start = (_second - _prev_second) != 0 + info['maj'][minute_start] = True + info['min'][second_start & (_second % label_interval == 0)] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[second_start & (_second % label_interval == 0)] = '%H:%M:%S' + info_fmt[day_start] = '%H:%M:%S\n%d-%b' + info_fmt[year_start] = '%H:%M:%S\n%d-%b\n%Y' + + if span < periodsperday / 12000.0: _second_finder(1) + elif span < 
periodsperday / 6000.0: _second_finder(2) + elif span < periodsperday / 2400.0: _second_finder(5) + elif span < periodsperday / 1200.0: _second_finder(10) + elif span < periodsperday / 800.0: _second_finder(15) + elif span < periodsperday / 400.0: _second_finder(30) + elif span < periodsperday / 150.0: _minute_finder(1) + elif span < periodsperday / 70.0: _minute_finder(2) + elif span < periodsperday / 24.0: _minute_finder(5) + elif span < periodsperday / 12.0: _minute_finder(15) + elif span < periodsperday / 6.0: _minute_finder(30) + elif span < periodsperday / 2.5: _hour_finder(1, False) + elif span < periodsperday / 1.5: _hour_finder(2, False) + elif span < periodsperday * 1.25: _hour_finder(3, False) + elif span < periodsperday * 2.5: _hour_finder(6, True) + elif span < periodsperday * 4: _hour_finder(12, True) + else: + info_maj[month_start] = True + info_min[day_start] = True + year_start = period_break(dates_, 'year') + info_fmt = info['fmt'] + info_fmt[day_start] = '%d' + info_fmt[month_start] = '%d\n%b' + info_fmt[year_start] = '%d\n%b\n%Y' + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(day_start)] = '%d\n%b\n%Y' + else: + info_fmt[first_label(month_start)] = '%d\n%b\n%Y' + + # Case 2. Less than three months + elif span <= periodsperyear // 4: + month_start = period_break(dates_, 'month') + info_maj[month_start] = True + if freq < FreqGroup.FR_HR: + info['min'] = True + else: + day_start = period_break(dates_, 'day') + info['min'][day_start] = True + week_start = period_break(dates_, 'week') + year_start = period_break(dates_, 'year') + info_fmt[week_start] = '%d' + info_fmt[month_start] = '\n\n%b' + info_fmt[year_start] = '\n\n%b\n%Y' + if not has_level_label(year_start, vmin_orig): + if not has_level_label(month_start, vmin_orig): + info_fmt[first_label(week_start)] = '\n\n%b\n%Y' + else: + info_fmt[first_label(month_start)] = '\n\n%b\n%Y' + # Case 3. Less than 14 months ............... + elif span <= 1.15 * periodsperyear: + year_start = period_break(dates_, 'year') + month_start = period_break(dates_, 'month') + week_start = period_break(dates_, 'week') + info_maj[month_start] = True + info_min[week_start] = True + info_min[year_start] = False + info_min[month_start] = False + info_fmt[month_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + if not has_level_label(year_start, vmin_orig): + info_fmt[first_label(month_start)] = '%b\n%Y' + # Case 4. Less than 2.5 years ............... + elif span <= 2.5 * periodsperyear: + year_start = period_break(dates_, 'year') + quarter_start = period_break(dates_, 'quarter') + month_start = period_break(dates_, 'month') + info_maj[quarter_start] = True + info_min[month_start] = True + info_fmt[quarter_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + # Case 4. Less than 4 years ................. + elif span <= 4 * periodsperyear: + year_start = period_break(dates_, 'year') + month_start = period_break(dates_, 'month') + info_maj[year_start] = True + info_min[month_start] = True + info_min[year_start] = False + + month_break = dates_[month_start].month + jan_or_jul = month_start[(month_break == 1) | (month_break == 7)] + info_fmt[jan_or_jul] = '%b' + info_fmt[year_start] = '%b\n%Y' + # Case 5. Less than 11 years ................ 
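    # Editor's note (illustrative aside, not part of the original patch):
    # for daily data (periodsperyear == 365) the branch below covers spans
    # of roughly 4 to 11 years: year starts become major ticks labelled
    # '%Y', quarter starts become minor ticks, and longer spans fall
    # through to the annual-spacing logic of Case 6.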
+ elif span <= 11 * periodsperyear: + year_start = period_break(dates_, 'year') + quarter_start = period_break(dates_, 'quarter') + info_maj[year_start] = True + info_min[quarter_start] = True + info_min[year_start] = False + info_fmt[year_start] = '%Y' + # Case 6. More than 12 years ................ + else: + year_start = period_break(dates_, 'year') + year_break = dates_[year_start].year + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(year_break % maj_anndef == 0)] + info_maj[major_idx] = True + minor_idx = year_start[(year_break % min_anndef == 0)] + info_min[minor_idx] = True + info_fmt[major_idx] = '%Y' + #............................................ + + return info + + +def _monthly_finder(vmin, vmax, freq): + periodsperyear = 12 + + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + #.............. + # Initialize the output + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + dates_ = info['val'] + info['fmt'] = '' + year_start = (dates_ % 12 == 0).nonzero()[0] + info_maj = info['maj'] + info_fmt = info['fmt'] + #.............. + if span <= 1.15 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + info_fmt[:] = '%b' + info_fmt[year_start] = '%b\n%Y' + + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = '%b\n%Y' + #.............. + elif span <= 2.5 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + # TODO: Check the following : is it really info['fmt'] ? + info['fmt'][quarter_start] = True + info['min'] = True + + info_fmt[quarter_start] = '%b' + info_fmt[year_start] = '%b\n%Y' + #.............. + elif span <= 4 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + jan_or_jul = (dates_ % 12 == 0) | (dates_ % 12 == 6) + info_fmt[jan_or_jul] = '%b' + info_fmt[year_start] = '%b\n%Y' + #.............. + elif span <= 11 * periodsperyear: + quarter_start = (dates_ % 3 == 0).nonzero() + info_maj[year_start] = True + info['min'][quarter_start] = True + + info_fmt[year_start] = '%Y' + #.................. + else: + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + years = dates_[year_start] // 12 + 1 + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info['min'][year_start[(years % min_anndef == 0)]] = True + + info_fmt[major_idx] = '%Y' + #.............. + return info + + +def _quarterly_finder(vmin, vmax, freq): + periodsperyear = 4 + vmin_orig = vmin + (vmin, vmax) = (int(vmin), int(vmax)) + span = vmax - vmin + 1 + #............................................ + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + info['fmt'] = '' + dates_ = info['val'] + info_maj = info['maj'] + info_fmt = info['fmt'] + year_start = (dates_ % 4 == 0).nonzero()[0] + #.............. + if span <= 3.5 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + + info_fmt[:] = 'Q%q' + info_fmt[year_start] = 'Q%q\n%F' + if not has_level_label(year_start, vmin_orig): + if dates_.size > 1: + idx = 1 + else: + idx = 0 + info_fmt[idx] = 'Q%q\n%F' + #.............. + elif span <= 11 * periodsperyear: + info_maj[year_start] = True + info['min'] = True + info_fmt[year_start] = '%F' + #.............. 
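    # Editor's note (illustrative aside, not part of the original patch):
    # the multi-year fallback below relies on _get_default_annual_spacing;
    # e.g. for a span of about 30 years it returns (1, 5), i.e. a minor
    # tick every year and a labelled major tick every 5 years.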
+ else: + years = dates_[year_start] // 4 + 1 + nyears = span / periodsperyear + (min_anndef, maj_anndef) = _get_default_annual_spacing(nyears) + major_idx = year_start[(years % maj_anndef == 0)] + info_maj[major_idx] = True + info['min'][year_start[(years % min_anndef == 0)]] = True + info_fmt[major_idx] = '%F' + #.............. + return info + +def _annual_finder(vmin, vmax, freq): + (vmin, vmax) = (int(vmin), int(vmax + 1)) + span = vmax - vmin + 1 + #.............. + info = np.zeros(span, + dtype=[('val', int), ('maj', bool), ('min', bool), + ('fmt', '|S8')]) + info['val'] = np.arange(vmin, vmax + 1) + info['fmt'] = '' + dates_ = info['val'] + #.............. + (min_anndef, maj_anndef) = _get_default_annual_spacing(span) + major_idx = dates_ % maj_anndef == 0 + info['maj'][major_idx] = True + info['min'][(dates_ % min_anndef == 0)] = True + info['fmt'][major_idx] = '%Y' + #.............. + return info + +def get_finder(freq): + if isinstance(freq, basestring): + freq = frequencies.get_freq(freq) + fgroup = frequencies.get_freq_group(freq) + + if fgroup == FreqGroup.FR_ANN: + return _annual_finder + elif fgroup == FreqGroup.FR_QTR: + return _quarterly_finder + elif freq ==FreqGroup.FR_MTH: + return _monthly_finder + elif ((freq >= FreqGroup.FR_BUS) or (freq == FreqGroup.FR_UND) or + fgroup == FreqGroup.FR_WK): + return _daily_finder + else: # pragma: no cover + errmsg = "Unsupported frequency: %s" % (freq) + raise NotImplementedError(errmsg) + +class TimeSeries_DateLocator(Locator): + """ + Locates the ticks along an axis controlled by a :class:`Series`. + + Parameters + ---------- + freq : {var} + Valid frequency specifier. + minor_locator : {False, True}, optional + Whether the locator is for minor ticks (True) or not. + dynamic_mode : {True, False}, optional + Whether the locator should work in dynamic mode. + base : {int}, optional + quarter : {int}, optional + month : {int}, optional + day : {int}, optional + """ + + def __init__(self, freq, minor_locator=False, dynamic_mode=True, + base=1, quarter=1, month=1, day=1, plot_obj=None): + if isinstance(freq, basestring): + freq = frequencies.get_freq(freq) + self.freq = freq + self.base = base + (self.quarter, self.month, self.day) = (quarter, month, day) + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _get_default_locs(self, vmin, vmax): + "Returns the default locations of ticks." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + + locator = self.plot_obj.date_axis_info + + if self.isminor: + return np.compress(locator['min'], locator['val']) + return np.compress(locator['maj'], locator['val']) + + def __call__(self): + 'Return the locations of the ticks.' + # axis calls Locator.set_axis inside set_m_formatter + vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + vmin, vmax = vi + if vmax < vmin: + vmin, vmax = vmax, vmin + if self.isdynamic: + locs = self._get_default_locs(vmin, vmax) + else: # pragma: no cover + base = self.base + (d, m) = divmod(vmin, base) + vmin = (d + 1) * base + locs = range(vmin, vmax + 1, base) + return locs + + def autoscale(self): + """ + Sets the view limits to the nearest multiples of base that contain the + data. 
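# ----------------------------------------------------------------------------
# [Editor's note: illustrative usage, not part of the imported pandas source.]
# The 'fmt' strings emitted by the finders ('%d\n%b\n%Y', 'Q%q\n%F', ...) are
# rendered by TimeSeries_DateFormatter through Period.strftime, which extends
# the usual strftime directives with period-specific ones such as '%q'
# (quarter) and '%F' (fiscal year). Assuming a pandas installation:
#
#     from pandas import Period
#     Period('2005Q3', freq='Q-DEC').strftime('Q%q\n%F')   # -> 'Q3\n2005'
#     Period('2005-07', freq='M').strftime('%b\n%Y')       # -> 'Jul\n2005'
# ----------------------------------------------------------------------------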
+ """ + # requires matplotlib >= 0.98.0 + (vmin, vmax) = self.axis.get_data_interval() + + locs = self._get_default_locs(vmin, vmax) + (vmin, vmax) = locs[[0, -1]] + if vmin == vmax: + vmin -= 1 + vmax += 1 + return nonsingular(vmin, vmax) + +#####------------------------------------------------------------------------- +#---- --- Formatter --- +#####------------------------------------------------------------------------- +class TimeSeries_DateFormatter(Formatter): + """ + Formats the ticks along an axis controlled by a :class:`PeriodIndex`. + + Parameters + ---------- + freq : {int, string} + Valid frequency specifier. + minor_locator : {False, True} + Whether the current formatter should apply to minor ticks (True) or + major ticks (False). + dynamic_mode : {True, False} + Whether the formatter works in dynamic mode or not. + """ + + def __init__(self, freq, minor_locator=False, dynamic_mode=True, + plot_obj=None): + if isinstance(freq, basestring): + freq = frequencies.get_freq(freq) + self.format = None + self.freq = freq + self.locs = [] + self.formatdict = None + self.isminor = minor_locator + self.isdynamic = dynamic_mode + self.offset = 0 + self.plot_obj = plot_obj + self.finder = get_finder(freq) + + def _set_default_format(self, vmin, vmax): + "Returns the default ticks spacing." + + if self.plot_obj.date_axis_info is None: + self.plot_obj.date_axis_info = self.finder(vmin, vmax, self.freq) + info = self.plot_obj.date_axis_info + + if self.isminor: + format = np.compress(info['min'] & np.logical_not(info['maj']), + info) + else: + format = np.compress(info['maj'], info) + self.formatdict = dict([(x, f) for (x, _, _, f) in format]) + return self.formatdict + + def set_locs(self, locs): + 'Sets the locations of the ticks' + # don't actually use the locs. This is just needed to work with + # matplotlib. 
Force to use vmin, vmax + self.locs = locs + + (vmin, vmax) = vi = tuple(self.axis.get_view_interval()) + if vi != self.plot_obj.view_interval: + self.plot_obj.date_axis_info = None + self.plot_obj.view_interval = vi + if vmax < vmin: + (vmin, vmax) = (vmax, vmin) + self._set_default_format(vmin, vmax) + # + def __call__(self, x, pos=0): + if self.formatdict is None: + return '' + else: + fmt = self.formatdict.pop(x, '') + return Period(ordinal=int(x), freq=self.freq).strftime(fmt) + + diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py new file mode 100644 index 00000000..98f9fbe3 --- /dev/null +++ b/pandas/tseries/frequencies.py @@ -0,0 +1,1025 @@ +from datetime import datetime +import re + +import numpy as np + +from pandas.tseries.offsets import DateOffset +from pandas.util.decorators import cache_readonly +import pandas.tseries.offsets as offsets +import pandas.core.common as com +import pandas.lib as lib + +class FreqGroup(object): + FR_ANN = 1000 + FR_QTR = 2000 + FR_MTH = 3000 + FR_WK = 4000 + FR_BUS = 5000 + FR_DAY = 6000 + FR_HR = 7000 + FR_MIN = 8000 + FR_SEC = 9000 + FR_UND = -10000 + +def get_freq_group(freq): + if isinstance(freq, basestring): + base, mult = get_freq_code(freq) + freq = base + return (freq // 1000) * 1000 + +def get_freq(freq): + if isinstance(freq, basestring): + base, mult = get_freq_code(freq) + freq = base + return freq + +def get_freq_code(freqstr): + """ + + Parameters + ---------- + + Returns + ------- + """ + if isinstance(freqstr, DateOffset): + freqstr = (get_offset_name(freqstr), freqstr.n) + + if isinstance(freqstr, tuple): + if (com.is_integer(freqstr[0]) and + com.is_integer(freqstr[1])): + #e.g., freqstr = (2000, 1) + return freqstr + else: + #e.g., freqstr = ('T', 5) + try: + code = _period_str_to_code(freqstr[0]) + stride = freqstr[1] + except: + code = _period_str_to_code(freqstr[1]) + stride = freqstr[0] + return code, stride + + if com.is_integer(freqstr): + return (freqstr, 1) + + base, stride = _base_and_stride(freqstr) + code = _period_str_to_code(base) + + return code, stride + + +def _get_freq_str(base, mult=1): + code = _reverse_period_code_map.get(base) + if mult == 1: + return code + return str(mult) + code + + +#---------------------------------------------------------------------- +# Offset names ("time rules") and related functions + + +from pandas.tseries.offsets import (Day, BDay, Hour, Minute, Second, Milli, + Week, Micro, MonthEnd, MonthBegin, + BMonthBegin, BMonthEnd, YearBegin, YearEnd, + BYearBegin, BYearEnd, QuarterBegin, + QuarterEnd, BQuarterBegin, BQuarterEnd) + +_offset_map = { + 'D' : Day(), + 'B' : BDay(), + 'H' : Hour(), + 'T' : Minute(), + 'S' : Second(), + 'L' : Milli(), + 'U' : Micro(), + None : None, + + # Monthly - Calendar + 'M' : MonthEnd(), + 'MS' : MonthBegin(), + + # Monthly - Business + 'BM' : BMonthEnd(), + 'BMS' : BMonthBegin(), + + # Annual - Calendar + 'A-JAN' : YearEnd(month=1), + 'A-FEB' : YearEnd(month=2), + 'A-MAR' : YearEnd(month=3), + 'A-APR' : YearEnd(month=4), + 'A-MAY' : YearEnd(month=5), + 'A-JUN' : YearEnd(month=6), + 'A-JUL' : YearEnd(month=7), + 'A-AUG' : YearEnd(month=8), + 'A-SEP' : YearEnd(month=9), + 'A-OCT' : YearEnd(month=10), + 'A-NOV' : YearEnd(month=11), + 'A-DEC' : YearEnd(month=12), + 'A' : YearEnd(month=12), + + # Annual - Calendar (start) + 'AS-JAN' : YearBegin(month=1), + 'AS' : YearBegin(month=1), + 'AS-FEB' : YearBegin(month=2), + 'AS-MAR' : YearBegin(month=3), + 'AS-APR' : YearBegin(month=4), + 'AS-MAY' : YearBegin(month=5), + 'AS-JUN' : 
YearBegin(month=6), + 'AS-JUL' : YearBegin(month=7), + 'AS-AUG' : YearBegin(month=8), + 'AS-SEP' : YearBegin(month=9), + 'AS-OCT' : YearBegin(month=10), + 'AS-NOV' : YearBegin(month=11), + 'AS-DEC' : YearBegin(month=12), + + # Annual - Business + 'BA-JAN' : BYearEnd(month=1), + 'BA-FEB' : BYearEnd(month=2), + 'BA-MAR' : BYearEnd(month=3), + 'BA-APR' : BYearEnd(month=4), + 'BA-MAY' : BYearEnd(month=5), + 'BA-JUN' : BYearEnd(month=6), + 'BA-JUL' : BYearEnd(month=7), + 'BA-AUG' : BYearEnd(month=8), + 'BA-SEP' : BYearEnd(month=9), + 'BA-OCT' : BYearEnd(month=10), + 'BA-NOV' : BYearEnd(month=11), + 'BA-DEC' : BYearEnd(month=12), + 'BA' : BYearEnd(month=12), + + # Annual - Business (Start) + 'BAS-JAN' : BYearBegin(month=1), + 'BAS' : BYearBegin(month=1), + 'BAS-FEB' : BYearBegin(month=2), + 'BAS-MAR' : BYearBegin(month=3), + 'BAS-APR' : BYearBegin(month=4), + 'BAS-MAY' : BYearBegin(month=5), + 'BAS-JUN' : BYearBegin(month=6), + 'BAS-JUL' : BYearBegin(month=7), + 'BAS-AUG' : BYearBegin(month=8), + 'BAS-SEP' : BYearBegin(month=9), + 'BAS-OCT' : BYearBegin(month=10), + 'BAS-NOV' : BYearBegin(month=11), + 'BAS-DEC' : BYearBegin(month=12), + + # Quarterly - Calendar + # 'Q' : QuarterEnd(startingMonth=3), + 'Q-JAN' : QuarterEnd(startingMonth=1), + 'Q-FEB' : QuarterEnd(startingMonth=2), + 'Q-MAR' : QuarterEnd(startingMonth=3), + 'Q-APR' : QuarterEnd(startingMonth=4), + 'Q-MAY' : QuarterEnd(startingMonth=5), + 'Q-JUN' : QuarterEnd(startingMonth=6), + 'Q-JUL' : QuarterEnd(startingMonth=7), + 'Q-AUG' : QuarterEnd(startingMonth=8), + 'Q-SEP' : QuarterEnd(startingMonth=9), + 'Q-OCT' : QuarterEnd(startingMonth=10), + 'Q-NOV' : QuarterEnd(startingMonth=11), + 'Q-DEC' : QuarterEnd(startingMonth=12), + + # Quarterly - Calendar (Start) + # 'QS' : QuarterBegin(startingMonth=1), + 'QS-JAN' : QuarterBegin(startingMonth=1), + 'QS-FEB' : QuarterBegin(startingMonth=2), + 'QS-MAR' : QuarterBegin(startingMonth=3), + 'QS-APR' : QuarterBegin(startingMonth=4), + 'QS-MAY' : QuarterBegin(startingMonth=5), + 'QS-JUN' : QuarterBegin(startingMonth=6), + 'QS-JUL' : QuarterBegin(startingMonth=7), + 'QS-AUG' : QuarterBegin(startingMonth=8), + 'QS-SEP' : QuarterBegin(startingMonth=9), + 'QS-OCT' : QuarterBegin(startingMonth=10), + 'QS-NOV' : QuarterBegin(startingMonth=11), + 'QS-DEC' : QuarterBegin(startingMonth=12), + + # Quarterly - Business + 'BQ-JAN' : BQuarterEnd(startingMonth=1), + 'BQ-FEB' : BQuarterEnd(startingMonth=2), + 'BQ-MAR' : BQuarterEnd(startingMonth=3), + + # 'BQ' : BQuarterEnd(startingMonth=3), + 'BQ-APR' : BQuarterEnd(startingMonth=4), + 'BQ-MAY' : BQuarterEnd(startingMonth=5), + 'BQ-JUN' : BQuarterEnd(startingMonth=6), + 'BQ-JUL' : BQuarterEnd(startingMonth=7), + 'BQ-AUG' : BQuarterEnd(startingMonth=8), + 'BQ-SEP' : BQuarterEnd(startingMonth=9), + 'BQ-OCT' : BQuarterEnd(startingMonth=10), + 'BQ-NOV' : BQuarterEnd(startingMonth=11), + 'BQ-DEC' : BQuarterEnd(startingMonth=12), + + # Quarterly - Business (Start) + 'BQS-JAN' : BQuarterBegin(startingMonth=1), + 'BQS' : BQuarterBegin(startingMonth=1), + 'BQS-FEB' : BQuarterBegin(startingMonth=2), + 'BQS-MAR' : BQuarterBegin(startingMonth=3), + 'BQS-APR' : BQuarterBegin(startingMonth=4), + 'BQS-MAY' : BQuarterBegin(startingMonth=5), + 'BQS-JUN' : BQuarterBegin(startingMonth=6), + 'BQS-JUL' : BQuarterBegin(startingMonth=7), + 'BQS-AUG' : BQuarterBegin(startingMonth=8), + 'BQS-SEP' : BQuarterBegin(startingMonth=9), + 'BQS-OCT' : BQuarterBegin(startingMonth=10), + 'BQS-NOV' : BQuarterBegin(startingMonth=11), + 'BQS-DEC' : BQuarterBegin(startingMonth=12), + + # Weekly + 
'W-MON' : Week(weekday=0), + 'W-TUE' : Week(weekday=1), + 'W-WED' : Week(weekday=2), + 'W-THU' : Week(weekday=3), + 'W-FRI' : Week(weekday=4), + 'W-SAT' : Week(weekday=5), + 'W-SUN' : Week(weekday=6), + +} + +_offset_to_period_map = { + 'WEEKDAY' : 'D', + 'EOM' : 'M', + 'BM' : 'M', + 'BQS' : 'Q', + 'QS' : 'Q', + 'BQ' : 'Q', + 'BA' : 'A', + 'AS' : 'A', + 'BAS' : 'A', + 'MS' : 'M', + 'D' : 'D', + 'B' : 'B', + 'T' : 'T', + 'S' : 'S', + 'H' : 'H', + 'Q' : 'Q', + 'A' : 'A', + 'W' : 'W', + 'M' : 'M' +} + +need_suffix = ['QS', 'BQ', 'BQS', 'AS', 'BA', 'BAS'] +_months = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] +for __prefix in need_suffix: + for _m in _months: + _offset_to_period_map['%s-%s' % (__prefix, _m)] = \ + _offset_to_period_map[__prefix] +for __prefix in ['A', 'Q']: + for _m in _months: + _alias = '%s-%s' % (__prefix, _m) + _offset_to_period_map[_alias] = _alias + +_days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] +for _d in _days: + _offset_to_period_map['W-%s' % _d] = 'W-%s' % _d + +def get_period_alias(offset_str): + """ alias to closest period strings BQ->Q etc""" + return _offset_to_period_map.get(offset_str, None) + +_rule_aliases = { + # Legacy rules that will continue to map to their original values + # essentially for the rest of time + + 'WEEKDAY': 'B', + 'EOM': 'BM', + + 'W@MON': 'W-MON', + 'W@TUE': 'W-TUE', + 'W@WED': 'W-WED', + 'W@THU': 'W-THU', + 'W@FRI': 'W-FRI', + 'W@SAT': 'W-SAT', + 'W@SUN': 'W-SUN', + 'W': 'W-SUN', + + 'Q@JAN': 'BQ-JAN', + 'Q@FEB': 'BQ-FEB', + 'Q@MAR': 'BQ-MAR', + 'Q' : 'Q-DEC', + + 'A@JAN' : 'BA-JAN', + 'A@FEB' : 'BA-FEB', + 'A@MAR' : 'BA-MAR', + 'A@APR' : 'BA-APR', + 'A@MAY' : 'BA-MAY', + 'A@JUN' : 'BA-JUN', + 'A@JUL' : 'BA-JUL', + 'A@AUG' : 'BA-AUG', + 'A@SEP' : 'BA-SEP', + 'A@OCT' : 'BA-OCT', + 'A@NOV' : 'BA-NOV', + 'A@DEC' : 'BA-DEC', + + # lite aliases + 'Min': 'T', + 'min': 'T', + 'ms': 'L', + 'us': 'U' +} + +for _i, _weekday in enumerate(['MON', 'TUE', 'WED', 'THU', 'FRI']): + for _iweek in xrange(4): + _name = 'WOM-%d%s' % (_iweek + 1, _weekday) + _offset_map[_name] = offsets.WeekOfMonth(week=_iweek, weekday=_i) + _rule_aliases[_name.replace('-', '@')] = _name + +_legacy_reverse_map = dict((v, k) for k, v in _rule_aliases.iteritems()) + +# for helping out with pretty-printing and name-lookups + +_offset_names = {} +for name, offset in _offset_map.iteritems(): + if offset is None: + continue + offset.name = name + _offset_names[offset] = name + + +def inferTimeRule(index): + from pandas.tseries.index import DatetimeIndex + import warnings + warnings.warn("This method is deprecated, use infer_freq or inferred_freq" + " attribute of DatetimeIndex", FutureWarning) + + freq = DatetimeIndex(index).inferred_freq + if freq is None: + raise Exception('Unable to infer time rule') + + offset = to_offset(freq) + return get_legacy_offset_name(offset) + + +def to_offset(freqstr): + """ + Return DateOffset object from string representation + + Example + ------- + to_offset('5Min') -> Minute(5) + """ + if freqstr is None: + return None + + if isinstance(freqstr, DateOffset): + return freqstr + + if isinstance(freqstr, tuple): + name = freqstr[0] + stride = freqstr[1] + if isinstance(stride, basestring): + name, stride = stride, name + name, _ = _base_and_stride(name) + delta = get_offset(name) * stride + else: + delta = None + stride_sign = None + try: + for stride, name, _ in opattern.findall(freqstr): + offset = get_offset(name) + if not stride: + stride = 1 + stride = int(stride) + if stride_sign is None: 
+ stride_sign = np.sign(stride) + offset = offset * int(np.fabs(stride) * stride_sign) + if delta is None: + delta = offset + else: + delta = delta + offset + except Exception: + raise ValueError("Could not evaluate %s" % freqstr) + + if delta is None: + raise ValueError('Unable to understand %s as a frequency' % freqstr) + + return delta + + +# hack to handle WOM-1MON +opattern = re.compile(r'([\-]?\d*)\s*([A-Za-z]+([\-@]\d*[A-Za-z]+)?)') + +def _base_and_stride(freqstr): + """ + Return base freq and stride info from string representation + + Example + ------- + _freq_and_stride('5Min') -> 'Min', 5 + """ + groups = opattern.match(freqstr) + + if not groups: + raise ValueError("Could not evaluate %s" % freqstr) + + stride = groups.group(1) + + if len(stride): + stride = int(stride) + else: + stride = 1 + + base = groups.group(2) + + return (base, stride) + +def get_base_alias(freqstr): + """ + Returns the base frequency alias, e.g., '5D' -> 'D' + """ + return _base_and_stride(freqstr)[0] + +_dont_uppercase = ['MS', 'ms'] + + +def get_offset(name): + """ + Return DateOffset object associated with rule name + + Example + ------- + get_offset('EOM') --> BMonthEnd(1) + """ + if name not in _dont_uppercase: + name = name.upper() + + if name in _rule_aliases: + name = _rule_aliases[name] + elif name.lower() in _rule_aliases: + name = _rule_aliases[name.lower()] + else: + if name in _rule_aliases: + name = _rule_aliases[name] + + offset = _offset_map.get(name) + + if offset is not None: + return offset + else: + raise Exception('Bad rule name requested: %s!' % name) + + +getOffset = get_offset + + +def hasOffsetName(offset): + return offset in _offset_names + +def get_offset_name(offset): + """ + Return rule name associated with a DateOffset object + + Example + ------- + get_offset_name(BMonthEnd(1)) --> 'EOM' + """ + name = _offset_names.get(offset) + + if name is not None: + return name + else: + raise Exception('Bad rule given: %s!' % offset) + +def get_legacy_offset_name(offset): + """ + Return the pre pandas 0.8.0 name for the date offset + """ + name = _offset_names.get(offset) + return _legacy_reverse_map.get(name, name) + +get_offset_name = get_offset_name + +def get_standard_freq(freq): + """ + Return the standardized frequency string + """ + if freq is None: + return None + + if isinstance(freq, DateOffset): + return get_offset_name(freq) + + code, stride = get_freq_code(freq) + return _get_freq_str(code, stride) + +#---------------------------------------------------------------------- +# Period codes + +# period frequency constants corresponding to scikits timeseries +# originals +_period_code_map = { + # Annual freqs with various fiscal year ends. + # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 + "A-DEC" : 1000, # Annual - December year end + "A-JAN" : 1001, # Annual - January year end + "A-FEB" : 1002, # Annual - February year end + "A-MAR" : 1003, # Annual - March year end + "A-APR" : 1004, # Annual - April year end + "A-MAY" : 1005, # Annual - May year end + "A-JUN" : 1006, # Annual - June year end + "A-JUL" : 1007, # Annual - July year end + "A-AUG" : 1008, # Annual - August year end + "A-SEP" : 1009, # Annual - September year end + "A-OCT" : 1010, # Annual - October year end + "A-NOV" : 1011, # Annual - November year end + + # Quarterly frequencies with various fiscal year ends. 
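# ----------------------------------------------------------------------------
# [Editor's note: illustrative usage, not part of the imported pandas source.]
# How the alias machinery defined above fits together, assuming pandas 0.8 is
# installed (the results noted in comments are approximate reprs):
#
#     from pandas.tseries.frequencies import to_offset, get_offset, get_freq_code
#     to_offset('5Min')        # 'Min' is a lite alias for 'T' -> Minute(5)
#     get_offset('EOM')        # legacy rule name, mapped via _rule_aliases -> BMonthEnd()
#     get_freq_code('Q-DEC')   # -> (2000, 1): quarterly/Dec period code, stride 1
#     get_freq_code('2D')      # -> (6000, 2): daily period code with a stride of 2
# ----------------------------------------------------------------------------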
+ # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 + "Q-DEC" : 2000 , # Quarterly - December year end + "Q-JAN" : 2001, # Quarterly - January year end + "Q-FEB" : 2002, # Quarterly - February year end + "Q-MAR" : 2003, # Quarterly - March year end + "Q-APR" : 2004, # Quarterly - April year end + "Q-MAY" : 2005, # Quarterly - May year end + "Q-JUN" : 2006, # Quarterly - June year end + "Q-JUL" : 2007, # Quarterly - July year end + "Q-AUG" : 2008, # Quarterly - August year end + "Q-SEP" : 2009, # Quarterly - September year end + "Q-OCT" : 2010, # Quarterly - October year end + "Q-NOV" : 2011, # Quarterly - November year end + + "M" : 3000, # Monthly + + "W-SUN" : 4000, # Weekly - Sunday end of week + "W-MON" : 4001, # Weekly - Monday end of week + "W-TUE" : 4002, # Weekly - Tuesday end of week + "W-WED" : 4003, # Weekly - Wednesday end of week + "W-THU" : 4004, # Weekly - Thursday end of week + "W-FRI" : 4005, # Weekly - Friday end of week + "W-SAT" : 4006, # Weekly - Saturday end of week + + "B" : 5000, # Business days + "D" : 6000, # Daily + "H" : 7000, # Hourly + "T" : 8000, # Minutely + "S" : 9000, # Secondly + None : -10000 # Undefined + +} + +_reverse_period_code_map = {} +for _k, _v in _period_code_map.iteritems(): + _reverse_period_code_map[_v] = _k + +# Additional aliases +_period_code_map.update({ + "Q" : 2000, # Quarterly - December year end (default quarterly) + "A" : 1000, # Annual + "W" : 4000, # Weekly +}) + +def _period_alias_dictionary(): + """ + Build freq alias dictionary to support freqs from original c_dates.c file + of the scikits.timeseries library. + """ + alias_dict = {} + + M_aliases = ["M", "MTH", "MONTH", "MONTHLY"] + B_aliases = ["B", "BUS", "BUSINESS", "BUSINESSLY", 'WEEKDAY'] + D_aliases = ["D", "DAY", "DLY", "DAILY"] + H_aliases = ["H", "HR", "HOUR", "HRLY", "HOURLY"] + T_aliases = ["T", "MIN", "MINUTE", "MINUTELY"] + S_aliases = ["S", "SEC", "SECOND", "SECONDLY"] + U_aliases = ["U", "UND", "UNDEF", "UNDEFINED"] + + for k in M_aliases: + alias_dict[k] = 'M' + + for k in B_aliases: + alias_dict[k] = 'B' + + for k in D_aliases: + alias_dict[k] = 'D' + + for k in H_aliases: + alias_dict[k] = 'H' + + for k in T_aliases: + alias_dict[k] = 'Min' + + for k in S_aliases: + alias_dict[k] = 'S' + + for k in U_aliases: + alias_dict[k] = None + + A_prefixes = ["A", "Y", "ANN", "ANNUAL", "ANNUALLY", "YR", "YEAR", + "YEARLY"] + + Q_prefixes = ["Q", "QTR", "QUARTER", "QUARTERLY", "Q-E", + "QTR-E", "QUARTER-E", "QUARTERLY-E"] + + month_names = [ + [ "DEC", "DECEMBER" ], + [ "JAN", "JANUARY" ], + [ "FEB", "FEBRUARY" ], + [ "MAR", "MARCH" ], + [ "APR", "APRIL" ], + [ "MAY", "MAY" ], + [ "JUN", "JUNE" ], + [ "JUL", "JULY" ], + [ "AUG", "AUGUST" ], + [ "SEP", "SEPTEMBER" ], + [ "OCT", "OCTOBER" ], + [ "NOV", "NOVEMBER" ] ] + + seps = ["@", "-"] + + for k in A_prefixes: + alias_dict[k] = 'A' + for m_tup in month_names: + for sep in seps: + m1, m2 = m_tup + alias_dict[k + sep + m1] = 'A-' + m1 + alias_dict[k + sep + m2] = 'A-' + m1 + + for k in Q_prefixes: + alias_dict[k] = 'Q' + for m_tup in month_names: + for sep in seps: + m1, m2 = m_tup + alias_dict[k + sep + m1] = 'Q-' + m1 + alias_dict[k + sep + m2] = 'Q-' + m1 + + W_prefixes = ["W", "WK", "WEEK", "WEEKLY"] + + day_names = [ + [ "SUN", "SUNDAY" ], + [ "MON", "MONDAY" ], + [ "TUE", "TUESDAY" ], + [ "WED", "WEDNESDAY" ], + [ "THU", "THURSDAY" ], + [ "FRI", "FRIDAY" ], + [ "SAT", "SATURDAY" ] ] + + for k in W_prefixes: + alias_dict[k] = 'W' + for d_tup in day_names: + for sep in ["@", "-"]: + d1, d2 = d_tup + alias_dict[k 
+ sep + d1] = 'W-' + d1 + alias_dict[k + sep + d2] = 'W-' + d1 + + return alias_dict + +_reso_period_map = { + "year" : "A", + "quarter" : "Q", + "month" : "M", + "day" : "D", + "hour" : "H", + "minute" : "T", + "second" : "S", +} + +def _infer_period_group(freqstr): + return _period_group(_reso_period_map[freqstr]) + +def _period_group(freqstr): + base, mult = get_freq_code(freqstr) + return base // 1000 * 1000 + +_period_alias_dict = _period_alias_dictionary() + +def _period_str_to_code(freqstr): + # hack + freqstr = _rule_aliases.get(freqstr, freqstr) + freqstr = _rule_aliases.get(freqstr.lower(), freqstr) + + try: + freqstr = freqstr.upper() + return _period_code_map[freqstr] + except: + alias = _period_alias_dict[freqstr] + return _period_code_map[alias] + + + +def infer_freq(index, warn=True): + """ + Infer the most likely frequency given the input index. If the frequency is + uncertain, a warning will be printed + + Parameters + ---------- + index : DatetimeIndex + + Returns + ------- + freq : string or None + None if no discernable frequency + """ + from pandas.tseries.index import DatetimeIndex + + if not isinstance(index, DatetimeIndex): + index = DatetimeIndex(index) + + inferer = _FrequencyInferer(index, warn=warn) + return inferer.get_freq() + +_ONE_MICRO = 1000L +_ONE_MILLI = _ONE_MICRO * 1000 +_ONE_SECOND = _ONE_MILLI * 1000 +_ONE_MINUTE = 60 * _ONE_SECOND +_ONE_HOUR = 60 * _ONE_MINUTE +_ONE_DAY = 24 * _ONE_HOUR + +class _FrequencyInferer(object): + """ + Not sure if I can avoid the state machine here + """ + + def __init__(self, index, warn=True): + self.index = index + self.values = np.asarray(index).view('i8') + self.warn = warn + + if len(index) < 3: + raise ValueError('Need at least 3 dates to infer frequency') + + self.deltas = lib.unique_deltas(self.values) + self.is_unique = len(self.deltas) == 1 + self.is_monotonic = self.index.is_monotonic + + def get_freq(self): + if not self.is_monotonic: + return None + + delta = self.deltas[0] + if _is_multiple(delta, _ONE_DAY): + return self._infer_daily_rule() + else: + # Possibly intraday frequency + if not self.is_unique: + return None + if _is_multiple(delta, _ONE_HOUR): + # Hours + return _maybe_add_count('H', delta / _ONE_HOUR) + elif _is_multiple(delta, _ONE_MINUTE): + # Minutes + return _maybe_add_count('T', delta / _ONE_MINUTE) + elif _is_multiple(delta, _ONE_SECOND): + # Seconds + return _maybe_add_count('S', delta / _ONE_SECOND) + elif _is_multiple(delta, _ONE_MILLI): + # Milliseconds + return _maybe_add_count('L', delta / _ONE_MILLI) + elif _is_multiple(delta, _ONE_MICRO): + # Microseconds + return _maybe_add_count('U', delta / _ONE_MICRO) + else: + # Nanoseconds + return _maybe_add_count('N', delta) + + @cache_readonly + def day_deltas(self): + return [x / _ONE_DAY for x in self.deltas] + + @cache_readonly + def fields(self): + return lib.build_field_sarray(self.values) + + @cache_readonly + def rep_stamp(self): + return lib.Timestamp(self.values[0]) + + def month_position_check(self): + # TODO: cythonize this, very slow + calendar_end = True + business_end = True + calendar_start = True + business_start = True + + years = self.fields['Y'] + months = self.fields['M'] + days = self.fields['D'] + weekdays = self.index.dayofweek + + from calendar import monthrange + for y, m, d, wd in zip(years, months, days, weekdays): + wd = datetime(y, m, d).weekday() + + if calendar_start: + calendar_start &= d == 1 + if business_start: + business_start &= d == 1 or (d <= 3 and wd == 0) + + _, daysinmonth = monthrange(y, m) + cal 
= d == daysinmonth + if calendar_end: + calendar_end &= cal + if business_end: + business_end &= cal or (daysinmonth - d < 3 and wd == 4) + + if calendar_end: + return 'ce' + elif business_end: + return 'be' + elif calendar_start: + return 'cs' + elif business_start: + return 'bs' + else: + return None + + @cache_readonly + def mdiffs(self): + nmonths = self.fields['Y'] * 12 + self.fields['M'] + return lib.unique_deltas(nmonths.astype('i8')) + + @cache_readonly + def ydiffs(self): + return lib.unique_deltas(self.fields['Y'].astype('i8')) + + def _infer_daily_rule(self): + annual_rule = self._get_annual_rule() + if annual_rule: + nyears = self.ydiffs[0] + month = _month_aliases[self.rep_stamp.month] + return _maybe_add_count('%s-%s' % (annual_rule, month), nyears) + + quarterly_rule = self._get_quarterly_rule() + if quarterly_rule: + nquarters = self.mdiffs[0] / 3 + month = _month_aliases[self.rep_stamp.month] + return _maybe_add_count('%s-%s' % (quarterly_rule, month), + nquarters) + + monthly_rule = self._get_monthly_rule() + if monthly_rule: + return monthly_rule + + if self.is_unique: + days = self.deltas[0] / _ONE_DAY + if days % 7 == 0: + # Weekly + alias = _weekday_rule_aliases[self.rep_stamp.weekday()] + return _maybe_add_count('W-%s' % alias, days / 7) + else: + return _maybe_add_count('D', days) + + # Business daily. Maybe + if self.day_deltas == [1, 3]: + return 'B' + + def _get_annual_rule(self): + if len(self.ydiffs) > 1: + return None + + if len(algos.unique(self.fields['M'])) > 1: + return None + + pos_check = self.month_position_check() + return {'cs': 'AS', 'bs': 'BAS', + 'ce': 'A', 'be': 'BA'}.get(pos_check) + + def _get_quarterly_rule(self): + if len(self.mdiffs) > 1: + return None + + if not self.mdiffs[0] % 3 == 0: + return None + + pos_check = self.month_position_check() + return {'cs': 'QS', 'bs': 'BQS', + 'ce': 'Q', 'be': 'BQ'}.get(pos_check) + + def _get_monthly_rule(self): + if len(self.mdiffs) > 1: + return None + pos_check = self.month_position_check() + return {'cs': 'MS', 'bs': 'BMS', + 'ce': 'M', 'be': 'BM'}.get(pos_check) + + +import pandas.core.algorithms as algos + +def _maybe_add_count(base, count): + if count > 1: + return '%d%s' % (count, base) + else: + return base + +def is_subperiod(source, target): + """ + Returns True if downsampling is possible between source and target + frequencies + + Parameters + ---------- + source : string + Frequency converting from + target : string + Frequency converting to + + Returns + ------- + is_subperiod : boolean + """ + if isinstance(source, offsets.DateOffset): + source = source.rule_code + + if isinstance(target, offsets.DateOffset): + target = target.rule_code + + target = target.upper() + source = source.upper() + if _is_annual(target): + return source in ['D', 'B', 'M', 'H', 'T', 'S'] + elif _is_quarterly(target): + return source in ['D', 'B', 'M', 'H', 'T', 'S'] + elif target == 'M': + return source in ['D', 'B', 'H', 'T', 'S'] + elif _is_weekly(target): + return source in [target, 'D', 'B', 'H', 'T', 'S'] + elif target == 'B': + return source in ['B', 'H', 'T', 'S'] + elif target == 'D': + return source in ['D', 'H', 'T', 'S'] + +def is_superperiod(source, target): + """ + Returns True if upsampling is possible between source and target + frequencies + + Parameters + ---------- + source : string + Frequency converting from + target : string + Frequency converting to + + Returns + ------- + is_superperiod : boolean + """ + if isinstance(source, offsets.DateOffset): + source = source.rule_code + + if 
isinstance(target, offsets.DateOffset): + target = target.rule_code + + target = target.upper() + source = source.upper() + if _is_annual(source): + if _is_annual(target): + return _get_rule_month(source) == _get_rule_month(target) + + if _is_quarterly(target): + smonth = _get_rule_month(source) + tmonth = _get_rule_month(target) + return _quarter_months_conform(smonth, tmonth) + return target in ['D', 'B', 'M', 'H', 'T', 'S'] + elif _is_quarterly(source): + return target in ['D', 'B', 'M', 'H', 'T', 'S'] + elif source == 'M': + return target in ['D', 'B', 'H', 'T', 'S'] + elif _is_weekly(source): + return target in [source, 'D', 'B', 'H', 'T', 'S'] + elif source == 'B': + return target in ['D', 'B', 'H', 'T', 'S'] + elif source == 'D': + return target in ['D', 'B', 'H', 'T', 'S'] + +def _get_rule_month(source, default='DEC'): + source = source.upper() + if '-' not in source: + return default + else: + return source.split('-')[1] + +def _is_annual(rule): + rule = rule.upper() + return rule == 'A' or rule.startswith('A-') + +def _quarter_months_conform(source, target): + snum = _month_numbers[source] + tnum = _month_numbers[target] + return snum % 3 == tnum % 3 + +def _is_quarterly(rule): + rule = rule.upper() + return rule == 'Q' or rule.startswith('Q-') + + +def _is_weekly(rule): + rule = rule.upper() + return rule == 'W' or rule.startswith('W-') + + +DAYS = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + +MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', + 'AUG', 'SEP', 'OCT', 'NOV', 'DEC'] + +_month_numbers = dict((k, i) for i, k in enumerate(MONTHS)) + + + +_weekday_rule_aliases = dict((k, v) for k, v in enumerate(DAYS)) +_month_aliases = dict((k + 1, v) for k, v in enumerate(MONTHS)) + +def _is_multiple(us, mult): + return us % mult == 0 diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py new file mode 100644 index 00000000..ae212902 --- /dev/null +++ b/pandas/tseries/index.py @@ -0,0 +1,1373 @@ +# pylint: disable=E1101 + +from datetime import time, datetime +from datetime import timedelta + +import numpy as np + +from pandas.core.common import isnull +from pandas.core.index import Index, Int64Index +from pandas.tseries.frequencies import infer_freq, to_offset +from pandas.tseries.offsets import DateOffset, generate_range, Tick +from pandas.tseries.tools import parse_time_string, normalize_date +from pandas.util.decorators import cache_readonly +import pandas.core.common as com +import pandas.tseries.offsets as offsets +import pandas.tseries.tools as tools + +from pandas.lib import Timestamp +import pandas.lib as lib +import pandas._algos as _algos + +def _utc(): + import pytz + return pytz.utc + +# -------- some conversion wrapper functions + + +def _field_accessor(name, field): + def f(self): + values = self.asi8 + if self.tz is not None: + utc = _utc() + if self.tz is not utc: + values = lib.tz_convert(values, utc, self.tz) + return lib.get_date_field(values, field) + f.__name__ = name + return property(f) + + +def _join_i8_wrapper(joinf, with_indexers=True): + @staticmethod + def wrapper(left, right): + if isinstance(left, np.ndarray): + left = left.view('i8', type=np.ndarray) + if isinstance(right, np.ndarray): + right = right.view('i8', type=np.ndarray) + results = joinf(left, right) + if with_indexers: + join_index, left_indexer, right_indexer = results + join_index = join_index.view('M8[ns]') + return join_index, left_indexer, right_indexer + return results + return wrapper + + +def _dt_index_cmp(opname): + """ + Wrap comparison operations to convert 
datetime-like to datetime64 + """ + def wrapper(self, other): + func = getattr(super(DatetimeIndex, self), opname) + if isinstance(other, datetime): + func = getattr(self, opname) + other = _to_m8(other) + elif isinstance(other, list): + other = DatetimeIndex(other) + elif not isinstance(other, np.ndarray): + other = _ensure_datetime64(other) + result = func(other) + + return result.view(np.ndarray) + + return wrapper + +def _ensure_datetime64(other): + if isinstance(other, np.datetime64): + return other + raise TypeError('%s type object %s' % (type(other), str(other))) + + +class TimeSeriesError(Exception): + pass + + +_midnight = time(0, 0) +_NS_DTYPE = np.dtype('M8[ns]') +_INT64_DTYPE = np.dtype(np.int64) + +class DatetimeIndex(Int64Index): + """ + Immutable ndarray of datetime64 data, represented internally as int64, and + which can be boxed to Timestamp objects that are subclasses of datetime and + carry metadata such as frequency information. + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional datetime-like data to construct index with + copy : bool + Make a copy of input ndarray + freq : string or pandas offset object, optional + One of pandas date offset strings or corresponding objects + start : starting value, datetime-like, optional + If data is None, start is used as the start point in generating regular + timestamp data. + periods : int, optional, > 0 + Number of periods to generate, if generating index. Takes precedence + over end argument + end : end time, datetime-like, optional + If periods is none, generated index will extend to first conforming + time on or just past end argument + """ + _join_precedence = 10 + + _inner_indexer = _join_i8_wrapper(_algos.inner_join_indexer_int64) + _outer_indexer = _join_i8_wrapper(_algos.outer_join_indexer_int64) + _left_indexer = _join_i8_wrapper(_algos.left_join_indexer_int64) + _left_indexer_unique = _join_i8_wrapper( + _algos.left_join_indexer_unique_int64, with_indexers=False) + _arrmap = None + + __eq__ = _dt_index_cmp('__eq__') + __ne__ = _dt_index_cmp('__ne__') + __lt__ = _dt_index_cmp('__lt__') + __gt__ = _dt_index_cmp('__gt__') + __le__ = _dt_index_cmp('__le__') + __ge__ = _dt_index_cmp('__ge__') + + # structured array cache for datetime fields + _sarr_cache = None + + _engine_type = lib.DatetimeEngine + + offset = None + + def __new__(cls, data=None, + freq=None, start=None, end=None, periods=None, + copy=False, name=None, tz=None, + verify_integrity=True, normalize=False, **kwds): + + warn = False + if 'offset' in kwds and kwds['offset']: + freq = kwds['offset'] + warn = True + + infer_freq = False + if not isinstance(freq, DateOffset): + if freq != 'infer': + freq = to_offset(freq) + else: + infer_freq = True + freq = None + + if warn: + import warnings + warnings.warn("parameter 'offset' is deprecated, " + "please use 'freq' instead", + FutureWarning) + + offset = freq + + if periods is not None: + if com.is_float(periods): + periods = int(periods) + elif not com.is_integer(periods): + raise ValueError('Periods must be a number, got %s' % + str(periods)) + + if data is None and offset is None: + raise ValueError("Must provide freq argument if no data is " + "supplied") + + if data is None: + return cls._generate(start, end, periods, name, offset, + tz=tz, normalize=normalize) + + if not isinstance(data, np.ndarray): + if np.isscalar(data): + raise ValueError('DatetimeIndex() must be called with a ' + 'collection of some kind, %s was passed' + % repr(data)) + + # other iterable of some kind + 
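# ----------------------------------------------------------------------------
# [Editor's note: illustrative usage, not part of the imported pandas source.]
# The constructor handles both explicit data and generated ranges; assuming
# pandas 0.8 is installed (tz handling additionally requires pytz):
from pandas import DatetimeIndex

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'])         # parsed from strings
DatetimeIndex(start='2000-01-01', periods=3, freq='D')            # generated; no data given
DatetimeIndex(start='2000-01-01', periods=3, freq='H', tz='UTC')  # localized to a time zone
# ----------------------------------------------------------------------------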
if not isinstance(data, (list, tuple)): + data = list(data) + + data = np.asarray(data, dtype='O') + + # try a few ways to make it datetime64 + if lib.is_string_array(data): + data = _str_to_dt_array(data, offset) + else: + data = tools.to_datetime(data) + data.offset = offset + + if issubclass(data.dtype.type, basestring): + subarr = _str_to_dt_array(data, offset) + elif issubclass(data.dtype.type, np.datetime64): + if isinstance(data, DatetimeIndex): + subarr = data.values + if offset is None: + offset = data.offset + verify_integrity = False + else: + if data.dtype != _NS_DTYPE: + subarr = lib.cast_to_nanoseconds(data) + else: + subarr = data + elif data.dtype == _INT64_DTYPE: + subarr = np.asarray(data, dtype=_NS_DTYPE) + else: + subarr = tools.to_datetime(data) + if not np.issubdtype(subarr.dtype, np.datetime64): + raise TypeError('Unable to convert %s to datetime dtype' + % str(data)) + + if tz is not None: + tz = tools._maybe_get_tz(tz) + # Convert local to UTC + ints = subarr.view('i8') + + subarr = lib.tz_localize_to_utc(ints, tz) + subarr = subarr.view(_NS_DTYPE) + + subarr = subarr.view(cls) + subarr.name = name + subarr.offset = offset + subarr.tz = tz + + if verify_integrity and len(subarr) > 0: + if offset is not None and not infer_freq: + inferred = subarr.inferred_freq + if inferred != offset.freqstr: + raise ValueError('Dates do not conform to passed ' + 'frequency') + + if infer_freq: + inferred = subarr.inferred_freq + if inferred: + subarr.offset = to_offset(inferred) + + return subarr + + @classmethod + def _generate(cls, start, end, periods, name, offset, + tz=None, normalize=False): + _normalized = True + + if start is not None: + start = Timestamp(start) + if normalize: + start = normalize_date(start) + _normalized = True + else: + _normalized = _normalized and start.time() == _midnight + + if end is not None: + end = Timestamp(end) + + if normalize: + end = normalize_date(end) + _normalized = True + else: + _normalized = _normalized and end.time() == _midnight + + start, end, tz = tools._figure_out_timezone(start, end, tz) + + if com._count_not_none(start, end, periods) < 2: + raise ValueError('Must specify two of start, end, or periods') + + if (offset._should_cache() and + not (offset._normalize_cache and not _normalized) and + _naive_in_cache_range(start, end)): + index = cls._cached_range(start, end, periods=periods, + offset=offset, name=name) + else: + index = _generate_regular_range(start, end, periods, offset) + + if tz is not None: + # Convert local to UTC + ints = index.view('i8', type=np.ndarray) + index = lib.tz_localize_to_utc(ints, tz) + index = index.view(_NS_DTYPE) + + index = index.view(cls) + index.name = name + index.offset = offset + index.tz = tz + + return index + + @classmethod + def _simple_new(cls, values, name, freq=None, tz=None): + result = values.view(cls) + result.name = name + result.offset = freq + result.tz = tools._maybe_get_tz(tz) + + return result + + @property + def tzinfo(self): + """ + Alias for tz attribute + """ + return self.tz + + @classmethod + def _cached_range(cls, start=None, end=None, periods=None, offset=None, + name=None): + if start is not None: + start = Timestamp(start) + if end is not None: + end = Timestamp(end) + + if offset is None: + raise Exception('Must provide a DateOffset!') + + drc = _daterange_cache + if offset not in _daterange_cache: + xdr = generate_range(offset=offset, start=_CACHE_START, + end=_CACHE_END) + + arr = tools.to_datetime(list(xdr), box=False) + + cachedRange = arr.view(DatetimeIndex) + 
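# ----------------------------------------------------------------------------
# [Editor's note: standalone sketch of the caching strategy used by
# _cached_range, with hypothetical names; not part of the imported source.]
# For a cacheable offset, the complete range between _CACHE_START and
# _CACHE_END is materialized once and later requests are answered by slicing:
_demo_cache = {}

def _demo_cached_range(step, lo, hi):
    if step not in _demo_cache:
        # expensive part: build the full range for this step only once
        _demo_cache[step] = list(range(0, 1000, step))
    return [x for x in _demo_cache[step] if lo <= x <= hi]

_demo_cached_range(7, 10, 60)   # -> [14, 21, 28, 35, 42, 49, 56]
# ----------------------------------------------------------------------------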
cachedRange.offset = offset + cachedRange.tz = None + cachedRange.name = None + drc[offset] = cachedRange + else: + cachedRange = drc[offset] + + if start is None: + assert(isinstance(end, Timestamp)) + + end = offset.rollback(end) + + endLoc = cachedRange.get_loc(end) + 1 + startLoc = endLoc - periods + elif end is None: + assert(isinstance(start, Timestamp)) + start = offset.rollforward(start) + + startLoc = cachedRange.get_loc(start) + endLoc = startLoc + periods + else: + if not offset.onOffset(start): + start = offset.rollforward(start) + + if not offset.onOffset(end): + end = offset.rollback(end) + + startLoc = cachedRange.get_loc(start) + endLoc = cachedRange.get_loc(end) + 1 + + indexSlice = cachedRange[startLoc:endLoc] + indexSlice.name = name + indexSlice.offset = offset + + return indexSlice + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return lib.ints_to_pydatetime(self.asi8) + + def __repr__(self): + from pandas.core.format import _format_datetime64 + values = self.values + + freq = None + if self.offset is not None: + freq = self.offset.freqstr + + summary = str(self.__class__) + if len(self) > 0: + first = _format_datetime64(values[0], tz=self.tz) + last = _format_datetime64(values[-1], tz=self.tz) + summary += '\n[%s, ..., %s]' % (first, last) + tagline = '\nLength: %d, Freq: %s, Timezone: %s' + summary += tagline % (len(self), freq, self.tz) + + return summary + + __str__ = __repr__ + + def __reduce__(self): + """Necessary for making this object picklable""" + object_state = list(np.ndarray.__reduce__(self)) + subclass_state = self.name, self.offset, self.tz + object_state[2] = (object_state[2], subclass_state) + return tuple(object_state) + + def __setstate__(self, state): + """Necessary for making this object picklable""" + if len(state) == 2: + nd_state, own_state = state + self.name = own_state[0] + self.offset = own_state[1] + self.tz = own_state[2] + np.ndarray.__setstate__(self, nd_state) + else: # pragma: no cover + np.ndarray.__setstate__(self, state) + + def __add__(self, other): + if isinstance(other, Index): + return self.union(other) + elif isinstance(other, (DateOffset, timedelta)): + return self._add_delta(other) + elif isinstance(other, np.timedelta64): + raise NotImplementedError + elif com.is_integer(other): + return self.shift(other) + else: # pragma: no cover + raise TypeError(other) + + def __sub__(self, other): + if isinstance(other, Index): + return self.diff(other) + elif isinstance(other, (DateOffset, timedelta)): + return self._add_delta(-other) + elif isinstance(other, np.timedelta64): + raise NotImplementedError + elif com.is_integer(other): + return self.shift(-other) + else: # pragma: no cover + raise TypeError(other) + + def _add_delta(self, delta): + if isinstance(delta, (Tick, timedelta)): + inc = offsets._delta_to_nanoseconds(delta) + new_values = (self.asi8 + inc).view(_NS_DTYPE) + else: + new_values = self.astype('O') + delta + return DatetimeIndex(new_values, tz=self.tz, freq='infer') + + def groupby(self, f): + objs = self.asobject + return _algos.groupby_object(objs, f) + + def summary(self, name=None): + if len(self) > 0: + index_summary = ', %s to %s' % (str(self[0]), str(self[-1])) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + result = '%s: %s entries%s' % (name, len(self), index_summary) + if self.freq: + result += '\nFreq: %s' % self.freqstr + + return result + + def append(self, other): + """ + Append a collection of Index options together + + Parameters + ---------- + 
other : Index or list/tuple of indices + + Returns + ------- + appended : Index + """ + from pandas.core.index import _ensure_compat_concat + + name = self.name + to_concat = [self] + + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) + else: + to_concat.append(other) + + for obj in to_concat: + if isinstance(obj, Index) and obj.name != name: + name = None + break + + to_concat = _ensure_compat_concat(to_concat) + to_concat = [x.values if isinstance(x, Index) else x + for x in to_concat] + + return Index(com._concat_compat(to_concat), name=name) + + def get_duplicates(self): + values = Index.get_duplicates(self) + return DatetimeIndex(values) + + def astype(self, dtype): + dtype = np.dtype(dtype) + + if dtype == np.object_: + return self.asobject + elif dtype == _INT64_DTYPE: + return self.asi8.copy() + else: # pragma: no cover + raise ValueError('Cannot cast DatetimeIndex to dtype %s' % dtype) + + @property + def asi8(self): + # do not cache or you'll create a memory leak + return self.values.view('i8') + + # @property + # def asstruct(self): + # utc = _utc() + # values = self.asi8 + # if self.tz is not None and self.tz is not utc: + # values = lib.tz_convert(values, utc, self.tz) + # return lib.build_field_sarray(values) + + def _get_time_micros(self): + utc = _utc() + values = self.asi8 + if self.tz is not None and self.tz is not utc: + values = lib.tz_convert(values, utc, self.tz) + return lib.get_time_micros(values) + + @property + def asobject(self): + """ + Convert to Index of datetime objects + """ + if isnull(self).any(): + msg = 'DatetimeIndex with NaT cannot be converted to object' + raise ValueError(msg) + return self._get_object_index() + + def tolist(self): + """ + See ndarray.tolist + """ + return list(self.asobject) + + def _get_object_index(self): + boxfunc = lambda x: Timestamp(x, offset=self.offset, tz=self.tz) + boxed_values = lib.map_infer(self.asi8, boxfunc) + return Index(boxed_values, dtype=object) + + def to_pydatetime(self): + """ + Return DatetimeIndex as object ndarray of datetime.datetime objects + + Returns + ------- + datetimes : ndarray + """ + return lib.ints_to_pydatetime(self.asi8, tz=self.tz) + + def to_period(self, freq=None): + """ + Cast to PeriodIndex at a particular frequency + """ + from pandas.tseries.period import PeriodIndex + + if self.freq is None and freq is None: + msg = "You must pass a freq argument as current index has none." 
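# ----------------------------------------------------------------------------
# [Editor's note: illustrative usage, not part of the imported pandas source.]
# Converting a timestamp index to periods, assuming pandas 0.8 is installed:
#
#     from pandas import DatetimeIndex
#     idx = DatetimeIndex(start='2000-01-31', periods=3, freq='M')
#     idx.to_period()         # uses the index's own freq -> monthly PeriodIndex
#     idx.to_period('Q-DEC')  # or convert to an explicit target frequency
# ----------------------------------------------------------------------------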
+ raise ValueError(msg) + + if freq is None: + freq = self.freqstr + + return PeriodIndex(self.values, freq=freq) + + def order(self, return_indexer=False, ascending=True): + """ + Return sorted copy of Index + """ + if return_indexer: + _as = self.argsort() + if not ascending: + _as = _as[::-1] + sorted_index = self.take(_as) + return sorted_index, _as + else: + sorted_values = np.sort(self.values) + if not ascending: + sorted_values = sorted_values[::-1] + return self._simple_new(sorted_values, self.name, None, + self.tz) + + def snap(self, freq='S'): + """ + Snap time stamps to nearest occuring frequency + + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.onOffset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + + def shift(self, n, freq=None): + """ + Specialized shift which produces a DatetimeIndex + + Parameters + ---------- + n : int + Periods to shift by + freq : DateOffset or timedelta-like, optional + + Returns + ------- + shifted : DatetimeIndex + """ + if freq is not None and freq != self.offset: + if isinstance(freq, basestring): + freq = to_offset(freq) + return Index.shift(self, n, freq) + + if n == 0: + # immutable so OK + return self + + if self.offset is None: + raise ValueError("Cannot shift with no offset") + + start = self[0] + n * self.offset + end = self[-1] + n * self.offset + return DatetimeIndex(start=start, end=end, freq=self.offset, + name=self.name) + + def repeat(self, repeats, axis=None): + """ + Analogous to ndarray.repeat + """ + return DatetimeIndex(self.values.repeat(repeats), + name=self.name) + + def take(self, indices, axis=0): + """ + Analogous to ndarray.take + """ + maybe_slice = lib.maybe_indices_to_slice(com._ensure_int64(indices)) + if isinstance(maybe_slice, slice): + return self[maybe_slice] + indices = com._ensure_platform_int(indices) + taken = self.values.take(indices, axis=axis) + return self._simple_new(taken, self.name, None, self.tz) + + def union(self, other): + """ + Specialized union for DatetimeIndex objects. 
If combine + overlapping ranges with the same DateOffset, will be much + faster than Index.union + + Parameters + ---------- + other : DatetimeIndex or array-like + + Returns + ------- + y : Index or DatetimeIndex + """ + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + + this, other = self._maybe_utc_convert(other) + + if this._can_fast_union(other): + return this._fast_union(other) + else: + result = Index.union(this, other) + if isinstance(result, DatetimeIndex): + result.tz = self.tz + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + def join(self, other, how='left', level=None, return_indexers=False): + """ + See Index.join + """ + if not isinstance(other, DatetimeIndex) and len(other) > 0: + try: + other = DatetimeIndex(other) + except TypeError: + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise Exception('Cannot join tz-naive with tz-aware DatetimeIndex') + elif other.tz is not None: + raise Exception('Cannot join tz-naive with tz-aware DatetimeIndex') + + if self.tz != other.tz: + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = self.name if self.name == other.name else None + if (isinstance(other, DatetimeIndex) + and self.offset == other.offset + and self._can_fast_union(other)): + joined = self._view_like(joined) + joined.name = name + return joined + else: + return DatetimeIndex(joined, name=name) + + def _can_fast_union(self, other): + if not isinstance(other, DatetimeIndex): + return False + + offset = self.offset + + if offset is None: + return False + + if not self.is_monotonic or not other.is_monotonic: + return False + + if len(self) == 0 or len(other) == 0: + return True + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + left_end = left[-1] + right_start = right[0] + + # Only need to "adjoin", not overlap + return (left_end + offset) >= right_start + + def _fast_union(self, other): + if len(other) == 0: + return self.view(type(self)) + + if len(self) == 0: + return other.view(type(self)) + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + left_start, left_end = left[0], left[-1] + right_end = right[-1] + + if not self.offset._should_cache(): + # concatenate dates + if left_end < right_end: + loc = right.searchsorted(left_end, side='right') + right_chunk = right.values[loc:] + dates = com._concat_compat((left.values, right_chunk)) + return self._view_like(dates) + else: + return left + else: + return type(self)(start=left_start, + end=max(left_end, right_end), + freq=left.offset) + + def __array_finalize__(self, obj): + if self.ndim == 0: # pragma: no cover + return self.item() + + self.offset = getattr(obj, 'offset', None) + self.tz = getattr(obj, 'tz', None) + + def intersection(self, other): + """ + Specialized intersection for DatetimeIndex objects. 
May be much faster + than Index.union + + Parameters + ---------- + other : DatetimeIndex or array-like + + Returns + ------- + y : Index or DatetimeIndex + """ + if not isinstance(other, DatetimeIndex): + try: + other = DatetimeIndex(other) + except TypeError: + pass + result = Index.intersection(self, other) + if isinstance(result, DatetimeIndex): + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + elif other.offset != self.offset or (not self.is_monotonic or + not other.is_monotonic): + result = Index.intersection(self, other) + if isinstance(result, DatetimeIndex): + if result.freq is None: + result.offset = to_offset(result.inferred_freq) + return result + + # to make our life easier, "sort" the two ranges + if self[0] <= other[0]: + left, right = self, other + else: + left, right = other, self + + end = min(left[-1], right[-1]) + start = right[0] + + if end < start: + return type(self)(data=[]) + else: + lslice = slice(*left.slice_locs(start, end)) + left_chunk = left.values[lslice] + return self._view_like(left_chunk) + + def _partial_date_slice(self, reso, parsed): + if not self.is_monotonic: + raise TimeSeriesError('Partial indexing only valid for ordered time' + ' series') + + if reso == 'year': + t1 = Timestamp(datetime(parsed.year, 1, 1)) + t2 = Timestamp(datetime(parsed.year, 12, 31)) + elif reso == 'month': + d = lib.monthrange(parsed.year, parsed.month)[1] + t1 = Timestamp(datetime(parsed.year, parsed.month, 1)) + t2 = Timestamp(datetime(parsed.year, parsed.month, d)) + elif reso == 'quarter': + qe = (((parsed.month - 1) + 2) % 12) + 1 # two months ahead + d = lib.monthrange(parsed.year, qe)[1] # at end of month + t1 = Timestamp(datetime(parsed.year, parsed.month, 1)) + t2 = Timestamp(datetime(parsed.year, qe, d)) + else: + raise KeyError + + stamps = self.asi8 + left = stamps.searchsorted(t1.value, side='left') + right = stamps.searchsorted(t2.value, side='right') + return slice(left, right) + + def _possibly_promote(self, other): + if other.inferred_type == 'date': + other = DatetimeIndex(other) + return self, other + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. 
Only use this if you + know what you're doing + """ + try: + return Index.get_value(self, series, key) + except KeyError: + + try: + loc = self._get_string_slice(key) + return series[loc] + except (TypeError, ValueError, KeyError): + pass + + if isinstance(key, time): + locs = self.indexer_at_time(key) + return series.take(locs) + + stamp = Timestamp(key) + try: + return self._engine.get_value(series, stamp) + except KeyError: + raise KeyError(stamp) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + try: + return self._engine.get_loc(key) + except KeyError: + try: + return self._get_string_slice(key) + except (TypeError, KeyError, ValueError): + pass + + if isinstance(key, time): + return self.indexer_at_time(key) + + try: + return self._engine.get_loc(Timestamp(key)) + except (KeyError, ValueError): + raise KeyError(key) + + def _get_string_slice(self, key): + freq = getattr(self, 'freqstr', + getattr(self, 'inferred_freq', None)) + asdt, parsed, reso = parse_time_string(key, freq) + key = asdt + loc = self._partial_date_slice(reso, parsed) + return loc + + def slice_locs(self, start=None, end=None): + """ + Index.slice_locs, customized to handle partial ISO-8601 string slicing + """ + if isinstance(start, basestring) or isinstance(end, basestring): + try: + if start: + start_loc = self._get_string_slice(start).start + else: + start_loc = 0 + + if end: + end_loc = self._get_string_slice(end).stop + else: + end_loc = len(self) + + return start_loc, end_loc + except KeyError: + pass + + return Index.slice_locs(self, start, end) + + def __getitem__(self, key): + """Override numpy.ndarray's __getitem__ method to work as desired""" + arr_idx = self.view(np.ndarray) + if np.isscalar(key): + val = arr_idx[key] + return Timestamp(val, offset=self.offset, tz=self.tz) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + key = lib.maybe_booleans_to_slice(key.view(np.uint8)) + + new_offset = None + if isinstance(key, slice): + if self.offset is not None and key.step is not None: + new_offset = key.step * self.offset + else: + new_offset = self.offset + + result = arr_idx[key] + if result.ndim > 1: + return result + + return self._simple_new(result, self.name, new_offset, self.tz) + + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + def map(self, f): + try: + return f(self) + except: + return _algos.arrmap_object(self.asobject, f) + + # alias to offset + @property + def freq(self): + return self.offset + + @cache_readonly + def inferred_freq(self): + try: + return infer_freq(self) + except ValueError: + return None + + @property + def freqstr(self): + return self.offset.freqstr + + year = _field_accessor('year', 'Y') + month = _field_accessor('month', 'M') + day = _field_accessor('day', 'D') + hour = _field_accessor('hour', 'h') + minute = _field_accessor('minute', 'm') + second = _field_accessor('second', 's') + microsecond = _field_accessor('microsecond', 'us') + nanosecond = _field_accessor('nanosecond', 'ns') + weekofyear = _field_accessor('weekofyear', 'woy') + week = weekofyear + dayofweek = _field_accessor('dayofweek', 'dow') + weekday = dayofweek + dayofyear = _field_accessor('dayofyear', 'doy') + quarter = _field_accessor('quarter', 'q') + + def normalize(self): + """ + Return DatetimeIndex with times to midnight. 
Length is unaltered + + Returns + ------- + normalized : DatetimeIndex + """ + new_values = lib.date_normalize(self.asi8) + return DatetimeIndex(new_values, freq='infer', name=self.name) + + def __iter__(self): + return iter(self._get_object_index()) + + def searchsorted(self, key, side='left'): + if isinstance(key, np.ndarray): + key = np.array(key, dtype=_NS_DTYPE, copy=False) + else: + key = _to_m8(key) + + return self.values.searchsorted(key, side=side) + + def is_type_compatible(self, typ): + return typ == self.inferred_type or typ == 'datetime' + + def argmin(self): + # hack to workaround argmin failure + try: + return self.values.argmin() + except Exception: # pragma: no cover + return self.asi8.argmin() + + @property + def inferred_type(self): + # b/c datetime is represented as microseconds since the epoch, make + # sure we can't have ambiguous indexing + return 'datetime64' + + @property + def dtype(self): + return _NS_DTYPE + + @property + def is_all_dates(self): + return True + + @cache_readonly + def is_normalized(self): + """ + Returns True if all of the dates are at midnight ("no time") + """ + return lib.dates_normalized(self.asi8) + + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self is other: + return True + + if (not hasattr(other, 'inferred_type') or + other.inferred_type != 'datetime64'): + if self.offset is not None: + return False + try: + other = DatetimeIndex(other) + except: + return False + + if self.tz is not None: + if other.tz is None: + return False + same_zone = self.tz.zone == other.tz.zone + else: + if other.tz is not None: + return False + same_zone = True + + return same_zone and np.array_equal(self.asi8, other.asi8) + + def insert(self, loc, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + """ + if type(item) == datetime: + item = _to_m8(item) + + new_index = np.concatenate((self[:loc].asi8, + [item.view(np.int64)], + self[loc:].asi8)) + return DatetimeIndex(new_index, freq='infer') + + def _view_like(self, ndarray): + result = ndarray.view(type(self)) + result.offset = self.offset + result.tz = self.tz + result.name = self.name + return result + + def tz_convert(self, tz): + """ + Convert DatetimeIndex from one time zone to another (using pytz) + + Returns + ------- + normalized : DatetimeIndex + """ + tz = tools._maybe_get_tz(tz) + + if self.tz is None: + # tz naive, use tz_localize + raise Exception('Cannot convert tz-naive timestamps, use ' + 'tz_localize to localize') + + # No conversion since timestamps are all UTC to begin with + return self._simple_new(self.values, self.name, self.offset, tz) + + def tz_localize(self, tz): + """ + Localize tz-naive DatetimeIndex to given time zone (using pytz) + + Returns + ------- + localized : DatetimeIndex + """ + if self.tz is not None: + raise ValueError("Already tz-aware, use tz_convert to convert.") + tz = tools._maybe_get_tz(tz) + + # Convert to UTC + new_dates = lib.tz_localize_to_utc(self.asi8, tz) + new_dates = new_dates.view(_NS_DTYPE) + + return self._simple_new(new_dates, self.name, self.offset, tz) + + def indexer_at_time(self, time, asof=False): + """ + Select values at particular time of day (e.g. 9:30AM) + + Parameters + ---------- + time : datetime.time or string + tz : string or pytz.timezone + Time zone for time. 
Corresponding timestamps would be converted to + time zone of the TimeSeries + + Returns + ------- + values_at_time : TimeSeries + """ + from dateutil.parser import parse + + if asof: + raise NotImplementedError + + if isinstance(time, basestring): + time = parse(time).time() + + if time.tzinfo: + # TODO + raise NotImplementedError + + time_micros = self._get_time_micros() + micros = _time_to_micros(time) + return (micros == time_micros).nonzero()[0] + + def indexer_between_time(self, start_time, end_time, include_start=True, + include_end=True): + """ + Select values between particular times of day (e.g., 9:00-9:30AM) + + Parameters + ---------- + start_time : datetime.time or string + end_time : datetime.time or string + include_start : boolean, default True + include_end : boolean, default True + tz : string or pytz.timezone, default None + + Returns + ------- + values_between_time : TimeSeries + """ + from dateutil.parser import parse + + if isinstance(start_time, basestring): + start_time = parse(start_time).time() + + if isinstance(end_time, basestring): + end_time = parse(end_time).time() + + if start_time.tzinfo or end_time.tzinfo: + raise NotImplementedError + + time_micros = self._get_time_micros() + start_micros = _time_to_micros(start_time) + end_micros = _time_to_micros(end_time) + + if include_start and include_end: + mask = ((start_micros <= time_micros) & + (time_micros <= end_micros)) + elif include_start: + mask = ((start_micros <= time_micros) & + (time_micros < end_micros)) + elif include_end: + mask = ((start_micros < time_micros) & + (time_micros <= end_micros)) + else: + mask = ((start_micros < time_micros) & + (time_micros < end_micros)) + + return mask.nonzero()[0] + +def _generate_regular_range(start, end, periods, offset): + if isinstance(offset, Tick): + stride = offset.nanos + if periods is None: + b = Timestamp(start).value + e = Timestamp(end).value + e += stride - e % stride + elif start is not None: + b = Timestamp(start).value + e = b + periods * stride + elif end is not None: + e = Timestamp(end).value + stride + b = e - periods * stride + else: + raise NotImplementedError + + data = np.arange(b, e, stride, dtype=np.int64) + data = data.view(_NS_DTYPE) + else: + xdr = generate_range(start=start, end=end, + periods=periods, offset=offset) + + data = np.array(list(xdr), dtype=_NS_DTYPE) + + return data + + +def date_range(start=None, end=None, periods=None, freq='D', tz=None, + normalize=False): + """ + Return a fixed frequency datetime index, with day (calendar) as the default + frequency + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'D' (calendar daily) + Frequency strings can have multiples, e.g. 
'5H' + tz : string or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Beijing + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : DatetimeIndex + """ + return DatetimeIndex(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize) + + +def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, + normalize=True): + """ + Return a fixed frequency datetime index, with business day as the default + frequency + + Parameters + ---------- + start : string or datetime-like, default None + Left bound for generating dates + end : string or datetime-like, default None + Right bound for generating dates + periods : integer or None, default None + If None, must specify start and end + freq : string or DateOffset, default 'B' (business daily) + Frequency strings can have multiples, e.g. '5H' + tz : string or None + Time zone name for returning localized DatetimeIndex, for example + Asia/Beijing + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + + Notes + ----- + 2 of start, end, or periods must be specified + + Returns + ------- + rng : DatetimeIndex + """ + + return DatetimeIndex(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize) + + +def _to_m8(key): + ''' + Timestamp-like => dt64 + ''' + if not isinstance(key, datetime): + # this also converts strings + key = Timestamp(key) + + return np.int64(lib.pydt_to_i8(key)).view(_NS_DTYPE) + + + +def _str_to_dt_array(arr, offset=None): + def parser(x): + result = parse_time_string(x, offset) + return result[0] + + arr = np.asarray(arr, dtype=object) + data = _algos.arrmap_object(arr, parser) + return tools.to_datetime(data) + + +_CACHE_START = Timestamp(datetime(1950, 1, 1)) +_CACHE_END = Timestamp(datetime(2030, 1, 1)) + +_daterange_cache = {} + + +def _naive_in_cache_range(start, end): + if start is None or end is None: + return False + else: + return _in_range(start, end, _CACHE_START, _CACHE_END) + +def _in_range(start, end, rng_start, rng_end): + return start > rng_start and end < rng_end + +def _time_to_micros(time): + seconds = time.hour * 60 * 60 + 60 * time.minute + time.second + return 1000000 * seconds + time.microsecond + + diff --git a/pandas/tseries/interval.py b/pandas/tseries/interval.py new file mode 100644 index 00000000..58c16dcf --- /dev/null +++ b/pandas/tseries/interval.py @@ -0,0 +1,35 @@ +import numpy as np + +from pandas.core.index import Index + +class Interval(object): + """ + Represents an interval of time defined by two timestamps + """ + + def __init__(self, start, end): + self.start = start + self.end = end + +class PeriodInterval(object): + """ + Represents an interval of time defined by two Period objects (time ordinals) + """ + + def __init__(self, start, end): + self.start = start + self.end = end + + +class IntervalIndex(Index): + """ + + """ + def __new__(self, starts, ends): + pass + + def dtype(self): + return self.values.dtype + +if __name__ == '__main__': + pass diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py new file mode 100644 index 00000000..f35a8597 --- /dev/null +++ b/pandas/tseries/offsets.py @@ -0,0 +1,1173 @@ +from datetime import datetime, timedelta + +import numpy as np + +from pandas.core.common import _count_not_none +from pandas.tseries.tools import to_datetime +from pandas.util.decorators import 
cache_readonly
+
+# import after tools, dateutil check
+from dateutil.relativedelta import relativedelta
+
+from pandas.lib import Timestamp
+import pandas.lib as lib
+
+__all__ = ['Day', 'BusinessDay', 'BDay',
+           'MonthBegin', 'BMonthBegin', 'MonthEnd', 'BMonthEnd',
+           'YearBegin', 'BYearBegin', 'YearEnd', 'BYearEnd',
+           'QuarterBegin', 'BQuarterBegin', 'QuarterEnd', 'BQuarterEnd',
+           'Week', 'WeekOfMonth',
+           'Hour', 'Minute', 'Second', 'Milli', 'Micro', 'Nano']
+
+#----------------------------------------------------------------------
+# DateOffset
+
+
+class CacheableOffset(object):
+
+    _cacheable = True
+
+
+class DateOffset(object):
+    """
+    Standard kind of date increment used for a date range.
+
+    Works exactly like relativedelta in terms of the keyword args you
+    pass in; use of the keyword n is discouraged -- you would be better
+    off specifying n in the keywords you use, but regardless it is
+    there for you. n is needed for DateOffset subclasses.
+
+    DateOffsets work as follows. Each offset specifies a set of dates
+    that conform to the DateOffset. For example, BDay defines this
+    set to be the set of dates that are weekdays (M-F). To test if a
+    date is in the set of a DateOffset instance dateOffset, we can use
+    the onOffset method: dateOffset.onOffset(date).
+
+    If a date is not a valid date for the offset, the rollback and
+    rollforward methods can be used to roll the date to the nearest
+    valid date before/after the date.
+
+    DateOffsets can be created to move dates forward a given number of
+    valid dates. For example, BDay(2) can be added to a date to move
+    it two business days forward. If the date does not start on a
+    valid date, first it is moved to a valid date. Thus the pseudo code
+    is:
+
+    def __add__(date):
+      date = rollback(date)  # does nothing if date is valid
+      return date
+
+
+    When a date offset is created for a negative number of periods,
+    the date is first rolled forward. The pseudo code is:
+
+    def __add__(date):
+      date = rollforward(date)  # does nothing if date is valid
+      return date
+
+
+    Zero presents a problem. Should it roll forward or back? We
+    arbitrarily have it rollforward:
+
+    date + BDay(0) == BDay.rollforward(date)
+
+    Since 0 is a bit weird, we suggest avoiding its use.
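+
+    As a quick, illustrative sketch of the behaviour described above
+    (the dates are arbitrary): adding BDay(2) to a Friday skips the
+    weekend, and rolling a Saturday forward lands on the next Monday.
+
+    >>> from datetime import datetime
+    >>> datetime(2011, 1, 7) + BDay(2)             # Friday + 2 business days
+    datetime.datetime(2011, 1, 11, 0, 0)
+    >>> BDay().rollforward(datetime(2011, 1, 8))   # Saturday -> Monday
+    datetime.datetime(2011, 1, 10, 0, 0)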
+ """ + _cacheable = False + _normalize_cache = True + + def __init__(self, n=1, **kwds): + self.n = int(n) + self.kwds = kwds + if len(kwds) > 0: + self._offset = relativedelta(**kwds) + else: + self._offset = timedelta(1) + + def apply(self, other): + if len(self.kwds) > 0: + if self.n > 0: + for i in xrange(self.n): + other = other + self._offset + else: + for i in xrange(-self.n): + other = other - self._offset + return other + else: + return other + timedelta(self.n) + + def isAnchored(self): + return (self.n == 1) + + def copy(self): + return self.__class__(self.n, **self.kwds) + + def _should_cache(self): + return self.isAnchored() and self._cacheable + + def _params(self): + attrs = [(k, v) for k, v in vars(self).iteritems() + if k not in ['kwds', '_offset', 'name']] + attrs.extend(self.kwds.items()) + attrs = sorted(set(attrs)) + + params = tuple([str(self.__class__)] + attrs) + return params + + def __repr__(self): + if hasattr(self, 'name') and len(self.name): + return self.name + + className = getattr(self, '_outputName', type(self).__name__) + exclude = set(['n', 'inc']) + attrs = [] + for attr in self.__dict__: + if ((attr == 'kwds' and len(self.kwds) == 0) + or attr.startswith('_')): + continue + if attr not in exclude: + attrs.append('='.join((attr, repr(getattr(self, attr))))) + + if abs(self.n) != 1: + plural = 's' + else: + plural = '' + + out = '<%s ' % self.n + className + plural + if attrs: + out += ': ' + ', '.join(attrs) + out += '>' + return out + + def __eq__(self, other): + if other is None: + return False + + if isinstance(other, basestring): + from pandas.tseries.frequencies import to_offset + other = to_offset(other) + + if not isinstance(other, DateOffset): + return False + + return self._params() == other._params() + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(self._params()) + + def __call__(self, other): + return self.apply(other) + + def __add__(self, other): + return self.apply(other) + + def __radd__(self, other): + return self.__add__(other) + + def __sub__(self, other): + if isinstance(other, datetime): + raise TypeError('Cannot subtract datetime from offset!') + elif type(other) == type(self): + return self.__class__(self.n - other.n, **self.kwds) + else: # pragma: no cover + raise TypeError('Cannot subtract %s from %s' + % (type(other), type(self))) + + def __rsub__(self, other): + return self.__class__(-self.n, **self.kwds) + other + + def __mul__(self, someInt): + return self.__class__(n=someInt * self.n, **self.kwds) + + def __rmul__(self, someInt): + return self.__mul__(someInt) + + def __neg__(self): + return self.__class__(-self.n, **self.kwds) + + def rollback(self, someDate): + """Roll provided date backward to next offset only if not on offset""" + if not self.onOffset(someDate): + someDate = someDate - self.__class__(1, **self.kwds) + return someDate + + def rollforward(self, dt): + """Roll provided date forward to next offset only if not on offset""" + if not self.onOffset(dt): + dt = dt + self.__class__(1, **self.kwds) + return dt + + def onOffset(self, dt): + if type(self) == DateOffset: + return True + + # Default (slow) method for determining if some date is a member of the + # date range generated by this offset. Subclasses may have this + # re-implemented in a nicer way. 
+ a = dt + b = ((dt + self) - self) + return a == b + + @property + def rule_code(self): + raise NotImplementedError + + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + return fstr + +class BusinessDay(CacheableOffset, DateOffset): + """ + DateOffset subclass representing possibly n business days + """ + def __init__(self, n=1, **kwds): + self.n = int(n) + self.kwds = kwds + self.offset = kwds.get('offset', timedelta(0)) + self.normalize = kwds.get('normalize', False) + + @property + def rule_code(self): + return 'B' + + def __repr__(self): + if hasattr(self, 'name') and len(self.name): + return self.name + + className = getattr(self, '_outputName', self.__class__.__name__) + attrs = [] + + if self.offset: + attrs = ['offset=%s' % repr(self.offset)] + + if abs(self.n) != 1: + plural = 's' + else: + plural = '' + + out = '<%s ' % self.n + className + plural + if attrs: + out += ': ' + ', '.join(attrs) + out += '>' + return out + + @property + def freqstr(self): + try: + code = self.rule_code + except NotImplementedError: + return repr(self) + + if self.n != 1: + fstr = '%d%s' % (self.n, code) + else: + fstr = code + + if self.offset: + fstr += self._offset_str() + + return fstr + + def _offset_str(self): + def get_str(td): + off_str = '' + if td.days > 0: + off_str += str(td.days) + 'D' + if td.seconds > 0: + s = td.seconds + hrs = int(s / 3600) + if hrs != 0: + off_str += str(hrs) + 'H' + s -= hrs * 3600 + mts = int(s / 60) + if mts != 0: + off_str += str(mts) + 'Min' + s -= mts * 60 + if s != 0: + off_str += str(s) + 's' + if td.microseconds > 0: + off_str += str(td.microseconds) + 'us' + return off_str + + if isinstance(self.offset, timedelta): + zero = timedelta(0, 0, 0) + if self.offset >= zero: + off_str = '+' + get_str(self.offset) + else: + off_str = '-' + get_str(-self.offset) + return off_str + else: + return '+' + repr(self.offset) + + def isAnchored(self): + return (self.n == 1) + + def apply(self, other): + if isinstance(other, datetime): + n = self.n + + if n == 0 and other.weekday() > 4: + n = 1 + + result = other + + while n != 0: + k = n // abs(n) + result = result + timedelta(k) + if result.weekday() < 5: + n -= k + + if self.normalize: + result = datetime(result.year, result.month, result.day) + + if self.offset: + result = result + self.offset + + return result + + elif isinstance(other, (timedelta, Tick)): + return BDay(self.n, offset=self.offset + other, + normalize=self.normalize) + else: + raise Exception('Only know how to combine business day with ' + 'datetime or timedelta!') + @classmethod + def onOffset(cls, dt): + return dt.weekday() < 5 + + +class MonthEnd(DateOffset, CacheableOffset): + """DateOffset of one month end""" + + def apply(self, other): + other = datetime(other.year, other.month, other.day) + + n = self.n + _, days_in_month = lib.monthrange(other.year, other.month) + if other.day != days_in_month: + other = other + relativedelta(months=-1, day=31) + if n <= 0: + n = n + 1 + other = other + relativedelta(months=n, day=31) + return other + + @classmethod + def onOffset(cls, dt): + days_in_month = lib.monthrange(dt.year, dt.month)[1] + return dt.day == days_in_month + + @property + def rule_code(self): + return 'M' + + +class MonthBegin(DateOffset, CacheableOffset): + """DateOffset of one month at beginning""" + + def apply(self, other): + n = self.n + + if other.day > 1 and n <= 0: #then roll forward if n<=0 + 
n += 1 + + other = other + relativedelta(months=n, day=1) + return other + + @classmethod + def onOffset(cls, dt): + return dt.day == 1 + + @property + def rule_code(self): + return 'MS' + + +class BusinessMonthEnd(CacheableOffset, DateOffset): + """DateOffset increments between business EOM dates""" + + def isAnchored(self): + return (self.n == 1) + + def apply(self, other): + other = datetime(other.year, other.month, other.day) + + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, other.month) + lastBDay = days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) + + if n > 0 and not other.day >= lastBDay: + n = n - 1 + elif n <= 0 and other.day > lastBDay: + n = n + 1 + other = other + relativedelta(months=n, day=31) + + if other.weekday() > 4: + other = other - BDay() + return other + + @property + def rule_code(self): + return 'BM' + + +class BusinessMonthBegin(DateOffset, CacheableOffset): + """DateOffset of one business month at beginning""" + + def apply(self, other): + n = self.n + + wkday, _ = lib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + + if other.day > first and n<=0: + # as if rolled forward already + n += 1 + + other = other + relativedelta(months=n) + wkday, _ = lib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + result = datetime(other.year, other.month, first) + return result + + @classmethod + def onOffset(cls, dt): + first_weekday, _ = lib.monthrange(dt.year, dt.month) + if first_weekday == 5: + return dt.day == 3 + elif first_weekday == 6: + return dt.day == 2 + else: + return dt.day == 1 + + @property + def rule_code(self): + return 'BMS' + + +class Week(DateOffset, CacheableOffset): + """ + Weekly offset + + Parameters + ---------- + weekday : int, default None + Always generate specific day of week. 0 for Monday + """ + def __init__(self, n=1, **kwds): + self.n = n + self.weekday = kwds.get('weekday', None) + + if self.weekday is not None: + if self.weekday < 0 or self.weekday > 6: + raise Exception('Day must be 0<=day<=6, got %d' % + self.weekday) + + self._inc = timedelta(weeks=1) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.weekday is not None) + + def apply(self, other): + if self.weekday is None: + return other + self.n * self._inc + + if self.n > 0: + k = self.n + otherDay = other.weekday() + if otherDay != self.weekday: + other = other + timedelta((self.weekday - otherDay) % 7) + k = k - 1 + for i in xrange(k): + other = other + self._inc + else: + k = self.n + otherDay = other.weekday() + if otherDay != self.weekday: + other = other + timedelta((self.weekday - otherDay) % 7) + for i in xrange(-k): + other = other - self._inc + return other + + def onOffset(self, dt): + return dt.weekday() == self.weekday + + @property + def rule_code(self): + suffix = '' + if self.weekday is not None: + suffix = '-%s' % (_weekday_dict[self.weekday]) + return 'W' + suffix + +_weekday_dict = { + 0: 'MON', + 1: 'TUE', + 2: 'WED', + 3: 'THU', + 4: 'FRI', + 5: 'SAT', + 6: 'SUN' +} + +class WeekOfMonth(DateOffset, CacheableOffset): + """ + Describes monthly dates like "the Tuesday of the 2nd week of each month" + + Parameters + ---------- + n : int + week : {0, 1, 2, 3, ...} + 0 is 1st week of month, 1 2nd week, etc. 
+ weekday : {0, 1, ..., 6} + 0: Mondays + 1: Tuedays + 2: Wednesdays + 3: Thursdays + 4: Fridays + 5: Saturdays + 6: Sundays + """ + def __init__(self, n=1, **kwds): + self.n = n + self.weekday = kwds['weekday'] + self.week = kwds['week'] + + if self.n == 0: + raise Exception('N cannot be 0') + + if self.weekday < 0 or self.weekday > 6: + raise Exception('Day must be 0<=day<=6, got %d' % + self.weekday) + if self.week < 0 or self.week > 3: + raise Exception('Week must be 0<=day<=3, got %d' % + self.week) + + self.kwds = kwds + + def apply(self, other): + offsetOfMonth = self.getOffsetOfMonth(other) + + if offsetOfMonth > other: + if self.n > 0: + months = self.n - 1 + else: + months = self.n + elif offsetOfMonth == other: + months = self.n + else: + if self.n > 0: + months = self.n + else: + months = self.n + 1 + + return self.getOffsetOfMonth(other + relativedelta(months=months, day=1)) + + def getOffsetOfMonth(self, dt): + w = Week(weekday=self.weekday) + d = datetime(dt.year, dt.month, 1) + + d = w.rollforward(d) + + for i in xrange(self.week): + d = w.apply(d) + + return d + + def onOffset(self, dt): + return dt == self.getOffsetOfMonth(dt) + + @property + def rule_code(self): + suffix = '-%d%s' % (self.week + 1, _weekday_dict.get(self.weekday, '')) + return 'WOM' + suffix + + +class BQuarterEnd(DateOffset, CacheableOffset): + """DateOffset increments between business Quarter dates + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/30/2007, 6/29/2007, ... + """ + _outputName = 'BusinessQuarterEnd' + + def __init__(self, n=1, **kwds): + self.n = n + self.startingMonth = kwds.get('startingMonth', 3) + + self.offset = BMonthEnd(3) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + def apply(self, other): + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, other.month) + lastBDay = days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) + + monthsToGo = 3 - ((other.month - self.startingMonth) % 3) + if monthsToGo == 3: + monthsToGo = 0 + + if n > 0 and not (other.day >= lastBDay and monthsToGo == 0): + n = n - 1 + elif n <= 0 and other.day > lastBDay and monthsToGo == 0: + n = n + 1 + + other = other + relativedelta(months=monthsToGo + 3*n, day=31) + + if other.weekday() > 4: + other = other - BDay() + + return other + + def onOffset(self, dt): + modMonth = (dt.month - self.startingMonth) % 3 + return BMonthEnd().onOffset(dt) and modMonth == 0 + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.startingMonth] + return 'BQ' + suffix + + +_month_dict = { + 1: 'JAN', + 2: 'FEB', + 3: 'MAR', + 4: 'APR', + 5: 'MAY', + 6: 'JUN', + 7: 'JUL', + 8: 'AUG', + 9: 'SEP', + 10: 'OCT', + 11: 'NOV', + 12: 'DEC' +} + + +class BQuarterBegin(DateOffset, CacheableOffset): + _outputName = "BusinessQuarterBegin" + + def __init__(self, n=1, **kwds): + self.n = n + self.startingMonth = kwds.get('startingMonth', 3) + + self.offset = BMonthBegin(3) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + def apply(self, other): + n = self.n + + wkday, _ = lib.monthrange(other.year, other.month) + + first = _get_firstbday(wkday) + + monthsSince = (other.month - self.startingMonth) % 3 + + if n <= 0 and monthsSince != 0: # make sure to roll forward so negate + monthsSince = monthsSince - 3 + + # roll forward if on same month later 
than first bday + if n <= 0 and (monthsSince == 0 and other.day > first): + n = n + 1 + # pretend to roll back if on same month but before firstbday + elif n > 0 and (monthsSince == 0 and other.day < first): + n = n - 1 + + # get the first bday for result + other = other + relativedelta(months=3*n - monthsSince) + wkday, _ = lib.monthrange(other.year, other.month) + first = _get_firstbday(wkday) + result = datetime(other.year, other.month, first, + other.hour, other.minute, other.second, + other.microsecond) + return result + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.startingMonth] + return 'BQS' + suffix + + +class QuarterEnd(DateOffset, CacheableOffset): + """DateOffset increments between business Quarter dates + startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... + startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... + startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... + """ + _outputName = 'QuarterEnd' + + def __init__(self, n=1, **kwds): + self.n = n + self.startingMonth = kwds.get('startingMonth', 3) + + self.offset = MonthEnd(3) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + def apply(self, other): + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, other.month) + + monthsToGo = 3 - ((other.month - self.startingMonth) % 3) + if monthsToGo == 3: + monthsToGo = 0 + + if n > 0 and not (other.day >= days_in_month and monthsToGo == 0): + n = n - 1 + + other = other + relativedelta(months=monthsToGo + 3*n, day=31) + + return other + + def onOffset(self, dt): + modMonth = (dt.month - self.startingMonth) % 3 + return MonthEnd().onOffset(dt) and modMonth == 0 + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.startingMonth] + return 'Q' + suffix + + +class QuarterBegin(DateOffset, CacheableOffset): + _outputName = 'QuarterBegin' + + def __init__(self, n=1, **kwds): + self.n = n + self.startingMonth = kwds.get('startingMonth', 3) + + self.offset = MonthBegin(3) + self.kwds = kwds + + def isAnchored(self): + return (self.n == 1 and self.startingMonth is not None) + + def apply(self, other): + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, other.month) + + monthsSince = (other.month - self.startingMonth) % 3 + + if n <= 0 and monthsSince != 0: + # make sure you roll forward, so negate + monthsSince = monthsSince - 3 + + if n < 0 and (monthsSince == 0 and other.day > 1): + # after start, so come back an extra period as if rolled forward + n = n + 1 + + other = other + relativedelta(months=3*n - monthsSince, day=1) + return other + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.startingMonth] + return 'QS' + suffix + + +class BYearEnd(DateOffset, CacheableOffset): + """DateOffset increments between business EOM dates""" + _outputName = 'BusinessYearEnd' + + def __init__(self, n=1, **kwds): + self.month = kwds.get('month', 12) + + if self.month < 1 or self.month > 12: + raise ValueError('Month must go from 1 to 12') + + DateOffset.__init__(self, n=n, **kwds) + + def apply(self, other): + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, self.month) + lastBDay = (days_in_month - + max(((wkday + days_in_month - 1) % 7) - 4, 0)) + + years = n + if n > 0: + if (other.month < self.month or + (other.month == self.month and other.day < lastBDay)): + years -= 1 + elif n <= 0: + if (other.month > self.month or + (other.month == self.month and other.day > lastBDay)): + years 
+= 1 + + other = other + relativedelta(years=years) + + _, days_in_month = lib.monthrange(other.year, self.month) + result = datetime(other.year, self.month, days_in_month, + other.hour, other.minute, other.second, + other.microsecond) + + if result.weekday() > 4: + result = result - BDay() + + return result + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.month] + return 'BA' + suffix + + +class BYearBegin(DateOffset, CacheableOffset): + """DateOffset increments between business year begin dates""" + _outputName = 'BusinessYearBegin' + + def __init__(self, n=1, **kwds): + self.month = kwds.get('month', 1) + + if self.month < 1 or self.month > 12: + raise ValueError('Month must go from 1 to 12') + + DateOffset.__init__(self, n=n, **kwds) + + def apply(self, other): + n = self.n + + wkday, days_in_month = lib.monthrange(other.year, self.month) + + first = _get_firstbday(wkday) + + years = n + + + if n > 0: # roll back first for positive n + if (other.month < self.month or + (other.month == self.month and other.day < first)): + years -= 1 + elif n <= 0: # roll forward + if (other.month > self.month or + (other.month == self.month and other.day > first)): + years += 1 + + # set first bday for result + other = other + relativedelta(years = years) + wkday, days_in_month = lib.monthrange(other.year, self.month) + first = _get_firstbday(wkday) + return datetime(other.year, self.month, first) + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.month] + return 'BAS' + suffix + + +class YearEnd(DateOffset, CacheableOffset): + """DateOffset increments between calendar year ends""" + + def __init__(self, n=1, **kwds): + self.month = kwds.get('month', 12) + + if self.month < 1 or self.month > 12: + raise ValueError('Month must go from 1 to 12') + + DateOffset.__init__(self, n=n, **kwds) + + def apply(self, other): + def _increment(date): + if date.month == self.month: + _, days_in_month = lib.monthrange(date.year, self.month) + if date.day != days_in_month: + year = date.year + else: + year = date.year + 1 + elif date.month < self.month: + year = date.year + else: + year = date.year + 1 + _, days_in_month = lib.monthrange(year, self.month) + return datetime(year, self.month, days_in_month, + date.hour, date.minute, date.second, + date.microsecond) + def _decrement(date): + year = date.year if date.month > self.month else date.year - 1 + _, days_in_month = lib.monthrange(year, self.month) + return datetime(year, self.month, days_in_month, + date.hour, date.minute, date.second, + date.microsecond) + + def _rollf(date): + if (date.month != self.month or + date.day < lib.monthrange(date.year, date.month)[1]): + date = _increment(date) + return date + + n = self.n + result = other + if n > 0: + while n > 0: + result = _increment(result) + n -= 1 + elif n < 0: + while n < 0: + result = _decrement(result) + n += 1 + else: + # n == 0, roll forward + result = _rollf(result) + + return result + + def onOffset(self, dt): + wkday, days_in_month = lib.monthrange(dt.year, self.month) + return self.month == dt.month and dt.day == days_in_month + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.month] + return 'A' + suffix + + +class YearBegin(DateOffset, CacheableOffset): + """DateOffset increments between calendar year begin dates""" + + def __init__(self, n=1, **kwds): + self.month = kwds.get('month', 12) + + if self.month < 1 or self.month > 12: + raise ValueError('Month must go from 1 to 12') + + DateOffset.__init__(self, n=n, **kwds) + + def 
apply(self, other): + n = self.n + if other.month != 1 or other.day != 1: + other = datetime(other.year, 1, 1, + other.hour, other.minute, other.second, + other.microsecond) + if n <= 0: + n = n + 1 + other = other + relativedelta(years = n, day=1) + return other + + @classmethod + def onOffset(cls, dt): + return dt.month == 1 and dt.day == 1 + + @property + def rule_code(self): + suffix = '-%s' % _month_dict[self.month] + return 'AS' + suffix + + +#---------------------------------------------------------------------- +# Ticks + +class Tick(DateOffset): + _inc = timedelta(microseconds=1000) + + def __add__(self, other): + if isinstance(other, Tick): + if type(self) == type(other): + return type(self)(self.n + other.n) + else: + return _delta_to_tick(self.delta + other.delta) + return self.apply(other) + + def __eq__(self, other): + if isinstance(other, basestring): + from pandas.tseries.frequencies import to_offset + other = to_offset(other) + + if isinstance(other, Tick): + return self.delta == other.delta + else: + return DateOffset.__eq__(self, other) + + # This is identical to DateOffset.__hash__, but has to be redefined here + # for Python 3, because we've redefined __eq__. + def __hash__(self): + return hash(self._params()) + + def __ne__(self, other): + if isinstance(other, basestring): + from pandas.tseries.frequencies import to_offset + other = to_offset(other) + + if isinstance(other, Tick): + return self.delta != other.delta + else: + return DateOffset.__ne__(self, other) + + @cache_readonly + def delta(self): + return self.n * self._inc + + @property + def nanos(self): + return _delta_to_nanoseconds(self.delta) + + def apply(self, other): + if isinstance(other, (datetime, timedelta)): + return other + self.delta + elif isinstance(other, type(self)): + return type(self)(self.n + other.n) + + _rule_base = 'undefined' + @property + def rule_code(self): + return self._rule_base + +def _delta_to_tick(delta): + if delta.microseconds == 0: + if delta.seconds == 0: + return Day(delta.days) + else: + seconds = delta.days * 86400 + delta.seconds + if seconds % 3600 == 0: + return Hour(seconds / 3600) + elif seconds % 60 == 0: + return Minute(seconds / 60) + else: + return Second(seconds) + else: + nanos = _delta_to_nanoseconds(delta) + if nanos % 1000000 == 0: + return Milli(nanos // 1000000) + elif nanos % 1000 == 0: + return Micro(nanos // 1000) + else: # pragma: no cover + return Nano(nanos) + +def _delta_to_nanoseconds(delta): + if isinstance(delta, Tick): + delta = delta.delta + return (delta.days * 24 * 60 * 60 * 1000000 + + delta.seconds * 1000000 + + delta.microseconds) * 1000 + +class Day(Tick, CacheableOffset): + _inc = timedelta(1) + _rule_base = 'D' + + def isAnchored(self): + + return False + +class Hour(Tick): + _inc = timedelta(0, 3600) + _rule_base = 'H' + +class Minute(Tick): + _inc = timedelta(0, 60) + _rule_base = 'T' + +class Second(Tick): + _inc = timedelta(0, 1) + _rule_base = 'S' + +class Milli(Tick): + _rule_base = 'L' + +class Micro(Tick): + _inc = timedelta(microseconds=1) + _rule_base = 'U' + +class Nano(Tick): + _inc = 1 + _rule_base = 'N' + +BDay = BusinessDay +BMonthEnd = BusinessMonthEnd +BMonthBegin = BusinessMonthBegin + + +def _get_firstbday(wkday): + """ + wkday is the result of monthrange(year, month) + + If it's a saturday or sunday, increment first business day to reflect this + """ + first = 1 + if wkday == 5: # on Saturday + first = 3 + elif wkday == 6: # on Sunday + first = 2 + return first + + +def generate_range(start=None, end=None, 
periods=None, + offset=BDay(), time_rule=None): + """ + Generates a sequence of dates corresponding to the specified time + offset. Similar to dateutil.rrule except uses pandas DateOffset + objects to represent time increments + + Parameters + ---------- + start : datetime (default None) + end : datetime (default None) + periods : int, optional + + Note + ---- + * This method is faster for generating weekdays than dateutil.rrule + * At least two of (start, end, periods) must be specified. + * If both start and end are specified, the returned dates will + satisfy start <= date <= end. + + Returns + ------- + dates : generator object + + """ + if time_rule is not None: + from pandas.tseries.frequencies import get_offset + offset = get_offset(time_rule) + + start = to_datetime(start) + end = to_datetime(end) + + if start and not offset.onOffset(start): + start = offset.rollforward(start) + + if end and not offset.onOffset(end): + end = offset.rollback(end) + + if periods is None and end < start: + end = None + periods = 0 + + if end is None: + end = start + (periods - 1) * offset + + if start is None: + start = end - (periods - 1) * offset + + cur = start + + next_date = cur + while cur <= end: + yield cur + + # faster than cur + offset + next_date = offset.apply(cur) + if next_date <= cur: + raise ValueError('Offset %s did not increment date' % offset) + cur = next_date diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py new file mode 100644 index 00000000..0a8c54e1 --- /dev/null +++ b/pandas/tseries/period.py @@ -0,0 +1,1076 @@ +# pylint: disable=E1101,E1103,W0232 +from datetime import datetime, date +import numpy as np + +from pandas.tseries.frequencies import (get_freq_code as _gfc, to_offset, + _month_numbers, FreqGroup) +from pandas.tseries.index import DatetimeIndex, Int64Index +from pandas.tseries.tools import parse_time_string +import pandas.tseries.frequencies as _freq_mod + +import pandas.core.common as com + +from pandas.lib import Timestamp +import pandas.lib as lib +import pandas._period as plib +import pandas._algos as _algos + + +#--------------- +# Period logic + + +def _period_field_accessor(name, alias): + def f(self): + base, mult = _gfc(self.freq) + return plib.get_period_field(alias, self.ordinal, base) + f.__name__ = name + return property(f) + +def _field_accessor(name, alias): + def f(self): + base, mult = _gfc(self.freq) + return plib.get_period_field_arr(alias, self.values, base) + f.__name__ = name + return property(f) + +class Period(object): + + __slots__ = ['freq', 'ordinal'] + + def __init__(self, value=None, freq=None, ordinal=None, + year=None, month=1, quarter=None, day=1, + hour=0, minute=0, second=0): + """ + Represents an period of time + + Parameters + ---------- + value : Period or basestring, default None + The time period represented (e.g., '4Q2005') + freq : str, default None + e.g., 'B' for businessday, ('T', 5) or '5T' for 5 minutes + year : int, default None + month : int, default 1 + quarter : int, default None + day : int, default 1 + hour : int, default 0 + minute : int, default 0 + second : int, default 0 + """ + # freq points to a tuple (base, mult); base is one of the defined + # periods such as A, Q, etc. 
Every five minutes would be, e.g., + # ('T', 5) but may be passed in as a string like '5T' + + self.freq = None + + # ordinal is the period offset from the gregorian proleptic epoch + self.ordinal = None + + if ordinal is not None and value is not None: + raise ValueError(("Only value or ordinal but not both should be " + "given but not both")) + elif ordinal is not None: + if not com.is_integer(ordinal): + raise ValueError("Ordinal must be an integer") + if freq is None: + raise ValueError('Must supply freq for ordinal value') + self.ordinal = ordinal + + elif value is None: + if freq is None: + raise ValueError("If value is None, freq cannot be None") + + self.ordinal = _ordinal_from_fields(year, month, quarter, day, + hour, minute, second, freq) + + elif isinstance(value, Period): + other = value + if freq is None or _gfc(freq) == _gfc(other.freq): + self.ordinal = other.ordinal + freq = other.freq + else: + converted = other.asfreq(freq) + self.ordinal = converted.ordinal + + elif isinstance(value, basestring) or com.is_integer(value): + if com.is_integer(value): + value = str(value) + + dt, freq = _get_date_and_freq(value, freq) + + elif isinstance(value, datetime): + dt = value + if freq is None: + raise ValueError('Must supply freq for datetime value') + elif isinstance(value, date): + dt = datetime(year=value.year, month=value.month, day=value.day) + if freq is None: + raise ValueError('Must supply freq for datetime value') + else: + msg = "Value must be Period, string, integer, or datetime" + raise ValueError(msg) + + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + + if self.ordinal is None: + self.ordinal = plib.period_ordinal(dt.year, dt.month, dt.day, + dt.hour, dt.minute, dt.second, + base) + + self.freq = _freq_mod._get_freq_str(base) + + def __eq__(self, other): + if isinstance(other, Period): + return (self.ordinal == other.ordinal + and _gfc(self.freq) == _gfc(other.freq)) + return False + + def __hash__(self): + return hash((self.ordinal, self.freq)) + + def __add__(self, other): + if com.is_integer(other): + return Period(ordinal=self.ordinal + other, freq=self.freq) + else: # pragma: no cover + raise TypeError(other) + + def __sub__(self, other): + if com.is_integer(other): + return Period(ordinal=self.ordinal - other, freq=self.freq) + if isinstance(other, Period): + if other.freq != self.freq: + raise ValueError("Cannot do arithmetic with " + "non-conforming periods") + return self.ordinal - other.ordinal + else: # pragma: no cover + raise TypeError(other) + + def asfreq(self, freq, how='E'): + """ + Convert Period to desired frequency, either at the start or end of the + interval + + Parameters + ---------- + freq : string + how : {'E', 'S', 'end', 'start'}, default 'end' + Start or end of the timespan + + Returns + ------- + resampled : Period + """ + how = _validate_end_alias(how) + base1, mult1 = _gfc(self.freq) + base2, mult2 = _gfc(freq) + + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + + end = how == 'E' + new_ordinal = plib.period_asfreq(self.ordinal, base1, base2, end) + + return Period(ordinal=new_ordinal, freq=base2) + + @property + def start_time(self): + return self.to_timestamp(how='S') + + @property + def end_time(self): + return self.to_timestamp(how='E') + + def to_timestamp(self, freq=None, how='S'): + """ + Return the Timestamp at the start/end of the period + + Parameters + ---------- + freq : string or DateOffset, default frequency of PeriodIndex + Target frequency + how: str, default 'S' 
(start) + 'S', 'E'. Can be aliased as case insensitive + 'Start', 'Finish', 'Begin', 'End' + + Returns + ------- + Timestamp + """ + if freq is None: + base, mult = _gfc(self.freq) + new_val = self + else: + base, mult = _gfc(freq) + new_val = self.asfreq(freq, how) + + dt64 = plib.period_ordinal_to_dt64(new_val.ordinal, base) + ts_freq = _period_rule_to_timestamp_rule(new_val.freq, how=how) + return Timestamp(dt64, offset=to_offset(ts_freq)) + + year = _period_field_accessor('year', 0) + month = _period_field_accessor('month', 3) + day = _period_field_accessor('day', 4) + hour = _period_field_accessor('hour', 5) + minute = _period_field_accessor('minute', 6) + second = _period_field_accessor('second', 7) + weekofyear = _period_field_accessor('week', 8) + week = weekofyear + dayofweek = _period_field_accessor('dayofweek', 10) + weekday = dayofweek + dayofyear = day_of_year = _period_field_accessor('dayofyear', 9) + quarter = _period_field_accessor('quarter', 2) + qyear = _period_field_accessor('qyear', 1) + + @classmethod + def now(cls, freq=None): + return Period(datetime.now(), freq=freq) + + def __repr__(self): + base, mult = _gfc(self.freq) + formatted = plib.period_format(self.ordinal, base) + freqstr = _freq_mod._reverse_period_code_map[base] + return "Period('%s', '%s')" % (formatted, freqstr) + + def __str__(self): + base, mult = _gfc(self.freq) + formatted = plib.period_format(self.ordinal, base) + return ("%s" % formatted) + + def strftime(self, fmt): + """ + Returns the string representation of the :class:`Period`, depending + on the selected :keyword:`format`. :keyword:`format` must be a string + containing one or several directives. The method recognizes the same + directives as the :func:`time.strftime` function of the standard Python + distribution, as well as the specific additional directives ``%f``, + ``%F``, ``%q``. (formatting & docs originally from scikits.timeries) + + +-----------+--------------------------------+-------+ + | Directive | Meaning | Notes | + +===========+================================+=======+ + | ``%a`` | Locale's abbreviated weekday | | + | | name. | | + +-----------+--------------------------------+-------+ + | ``%A`` | Locale's full weekday name. | | + +-----------+--------------------------------+-------+ + | ``%b`` | Locale's abbreviated month | | + | | name. | | + +-----------+--------------------------------+-------+ + | ``%B`` | Locale's full month name. | | + +-----------+--------------------------------+-------+ + | ``%c`` | Locale's appropriate date and | | + | | time representation. | | + +-----------+--------------------------------+-------+ + | ``%d`` | Day of the month as a decimal | | + | | number [01,31]. | | + +-----------+--------------------------------+-------+ + | ``%f`` | 'Fiscal' year without a | \(1) | + | | century as a decimal number | | + | | [00,99] | | + +-----------+--------------------------------+-------+ + | ``%F`` | 'Fiscal' year with a century | \(2) | + | | as a decimal number | | + +-----------+--------------------------------+-------+ + | ``%H`` | Hour (24-hour clock) as a | | + | | decimal number [00,23]. | | + +-----------+--------------------------------+-------+ + | ``%I`` | Hour (12-hour clock) as a | | + | | decimal number [01,12]. | | + +-----------+--------------------------------+-------+ + | ``%j`` | Day of the year as a decimal | | + | | number [001,366]. | | + +-----------+--------------------------------+-------+ + | ``%m`` | Month as a decimal number | | + | | [01,12]. 
| | + +-----------+--------------------------------+-------+ + | ``%M`` | Minute as a decimal number | | + | | [00,59]. | | + +-----------+--------------------------------+-------+ + | ``%p`` | Locale's equivalent of either | \(3) | + | | AM or PM. | | + +-----------+--------------------------------+-------+ + | ``%q`` | Quarter as a decimal number | | + | | [01,04] | | + +-----------+--------------------------------+-------+ + | ``%S`` | Second as a decimal number | \(4) | + | | [00,61]. | | + +-----------+--------------------------------+-------+ + | ``%U`` | Week number of the year | \(5) | + | | (Sunday as the first day of | | + | | the week) as a decimal number | | + | | [00,53]. All days in a new | | + | | year preceding the first | | + | | Sunday are considered to be in | | + | | week 0. | | + +-----------+--------------------------------+-------+ + | ``%w`` | Weekday as a decimal number | | + | | [0(Sunday),6]. | | + +-----------+--------------------------------+-------+ + | ``%W`` | Week number of the year | \(5) | + | | (Monday as the first day of | | + | | the week) as a decimal number | | + | | [00,53]. All days in a new | | + | | year preceding the first | | + | | Monday are considered to be in | | + | | week 0. | | + +-----------+--------------------------------+-------+ + | ``%x`` | Locale's appropriate date | | + | | representation. | | + +-----------+--------------------------------+-------+ + | ``%X`` | Locale's appropriate time | | + | | representation. | | + +-----------+--------------------------------+-------+ + | ``%y`` | Year without century as a | | + | | decimal number [00,99]. | | + +-----------+--------------------------------+-------+ + | ``%Y`` | Year with century as a decimal | | + | | number. | | + +-----------+--------------------------------+-------+ + | ``%Z`` | Time zone name (no characters | | + | | if no time zone exists). | | + +-----------+--------------------------------+-------+ + | ``%%`` | A literal ``'%'`` character. | | + +-----------+--------------------------------+-------+ + + .. note:: + + (1) + The ``%f`` directive is the same as ``%y`` if the frequency is + not quarterly. + Otherwise, it corresponds to the 'fiscal' year, as defined by + the :attr:`qyear` attribute. + + (2) + The ``%F`` directive is the same as ``%Y`` if the frequency is + not quarterly. + Otherwise, it corresponds to the 'fiscal' year, as defined by + the :attr:`qyear` attribute. + + (3) + The ``%p`` directive only affects the output hour field + if the ``%I`` directive is used to parse the hour. + + (4) + The range really is ``0`` to ``61``; this accounts for leap + seconds and the (very rare) double leap seconds. + + (5) + The ``%U`` and ``%W`` directives are only used in calculations + when the day of the week and the year are specified. + + .. rubric:: Examples + + >>> a = Period(freq='Q@JUL', year=2006, quarter=1) + >>> a.strftime('%F-Q%q') + '2006-Q1' + >>> # Output the last month in the quarter of this date + >>> a.strftime('%b-%Y') + 'Oct-2005' + >>> + >>> a = Period(freq='D', year=2001, month=1, day=1) + >>> a.strftime('%d-%b-%Y') + '01-Jan-2006' + >>> a.strftime('%b. %d, %Y was a %A') + 'Jan. 
01, 2001 was a Monday' + """ + base, mult = _gfc(self.freq) + return plib.period_format(self.ordinal, base, fmt) + +def _get_date_and_freq(value, freq): + value = value.upper() + dt, _, reso = parse_time_string(value, freq) + + if freq is None: + if reso == 'year': + freq = 'A' + elif reso == 'quarter': + freq = 'Q' + elif reso == 'month': + freq = 'M' + elif reso == 'day': + freq = 'D' + elif reso == 'hour': + freq = 'H' + elif reso == 'minute': + freq = 'T' + elif reso == 'second': + freq = 'S' + else: + raise ValueError("Invalid frequency or could not infer: %s" % reso) + + return dt, freq + + +def _period_unbox(key, check=None): + ''' + Period-like => int64 + ''' + if not isinstance(key, Period): + key = Period(key, freq=check) + elif check is not None: + if key.freq != check: + raise ValueError("%s is wrong freq" % key) + return np.int64(key.ordinal) + +def _period_unbox_array(arr, check=None): + unboxer = np.frompyfunc(lambda x: _period_unbox(x, check=check), 1, 1) + return unboxer(arr) + +def dt64arr_to_periodarr(data, freq): + if data.dtype != np.dtype('M8[ns]'): + raise ValueError('Wrong dtype: %s' % data.dtype) + + base, mult = _gfc(freq) + return plib.dt64arr_to_periodarr(data.view('i8'), base) + +# --- Period index sketch + + +def _period_index_cmp(opname): + """ + Wrap comparison operations to convert datetime-like to datetime64 + """ + def wrapper(self, other): + if isinstance(other, Period): + func = getattr(self.values, opname) + assert(other.freq == self.freq) + result = func(other.ordinal) + elif isinstance(other, PeriodIndex): + assert(other.freq == self.freq) + return getattr(self.values, opname)(other.values) + else: + other = Period(other, freq=self.freq) + func = getattr(self.values, opname) + result = func(other.ordinal) + + return result + return wrapper + + +_INT64_DTYPE = np.dtype(np.int64) +_NS_DTYPE = np.dtype('M8[ns]') + + +class PeriodIndex(Int64Index): + """ + Immutable ndarray holding ordinal values indicating regular periods in + time such as particular years, quarters, months, etc. A value of 1 is the + period containing the Gregorian proleptic datetime Jan 1, 0001 00:00:00. + This ordinal representation is from the scikits.timeseries project. + + For instance, + # construct period for day 1/1/1 and get the first second + i = Period(year=1,month=1,day=1,freq='D').asfreq('S', 'S') + i.ordinal + ===> 1 + + Index keys are boxed to Period objects which carries the metadata (eg, + frequency information). + + Parameters + ---------- + data : array-like (1-dimensional), optional + Optional period-like data to construct index with + dtype : NumPy dtype (default: i8) + copy : bool + Make a copy of input ndarray + freq : string or period object, optional + One of pandas period strings or corresponding objects + start : starting value, period-like, optional + If data is None, used as the start point in generating regular + period data. + periods : int, optional, > 0 + Number of periods to generate, if generating index. 
Takes precedence + over end argument + end : end value, period-like, optional + If periods is none, generated index will extend to first conforming + period on or just past end argument + year : int or array, default None + month : int or array, default None + quarter : int or array, default None + day : int or array, default None + hour : int or array, default None + minute : int or array, default None + second : int or array, default None + + Examples + -------- + >>> idx = PeriodIndex(year=year_arr, quarter=q_arr) + + >>> idx2 = PeriodIndex(start='2000', end='2010', freq='A') + """ + _box_scalars = True + + __eq__ = _period_index_cmp('__eq__') + __ne__ = _period_index_cmp('__ne__') + __lt__ = _period_index_cmp('__lt__') + __gt__ = _period_index_cmp('__gt__') + __le__ = _period_index_cmp('__le__') + __ge__ = _period_index_cmp('__ge__') + + def __new__(cls, data=None, ordinal=None, + freq=None, start=None, end=None, periods=None, + copy=False, name=None, + year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None): + + freq = _freq_mod.get_standard_freq(freq) + + if periods is not None: + if com.is_float(periods): + periods = int(periods) + elif not com.is_integer(periods): + raise ValueError('Periods must be a number, got %s' % + str(periods)) + + if data is None: + if ordinal is not None: + data = np.asarray(ordinal, dtype=np.int64) + else: + fields = [year, month, quarter, day, hour, minute, second] + data, freq = cls._generate_range(start, end, periods, + freq, fields) + else: + ordinal, freq = cls._from_arraylike(data, freq) + data = np.array(ordinal, dtype=np.int64, copy=False) + + subarr = data.view(cls) + subarr.name = name + subarr.freq = freq + + return subarr + + @classmethod + def _generate_range(cls, start, end, periods, freq, fields): + field_count = com._count_not_none(*fields) + if com._count_not_none(start, end) > 0: + if field_count > 0: + raise ValueError('Can either instantiate from fields ' + 'or endpoints, but not both') + subarr, freq = _get_ordinal_range(start, end, periods, freq) + elif field_count > 0: + y, mth, q, d, h, minute, s = fields + subarr, freq = _range_from_fields(year=y, month=mth, quarter=q, + day=d, hour=h, minute=minute, + second=s, freq=freq) + else: + raise ValueError('Not enough parameters to construct ' + 'Period range') + + return subarr, freq + + @classmethod + def _from_arraylike(cls, data, freq): + if not isinstance(data, np.ndarray): + if np.isscalar(data) or isinstance(data, Period): + raise ValueError('PeriodIndex() must be called with a ' + 'collection of some kind, %s was passed' + % repr(data)) + + # other iterable of some kind + if not isinstance(data, (list, tuple)): + data = list(data) + + try: + data = np.array(data, dtype='i8') + except (TypeError, ValueError): + data = np.array(data, dtype='O') + + if freq is None and len(data) > 0: + freq = getattr(data[0], 'freq', None) + + if freq is None: + raise ValueError(('freq not specified and cannot be inferred ' + 'from first element')) + + data = _period_unbox_array(data, check=freq) + else: + if isinstance(data, PeriodIndex): + if freq is None or freq == data.freq: + freq = data.freq + data = data.values + else: + base1, _ = _gfc(data.freq) + base2, _ = _gfc(freq) + data = plib.period_asfreq_arr(data.values, base1, base2, 1) + else: + if freq is None and len(data) > 0: + freq = getattr(data[0], 'freq', None) + + if freq is None: + raise ValueError(('freq not specified and cannot be ' + 'inferred from first element')) + + if np.issubdtype(data.dtype, 
np.datetime64): + data = dt64arr_to_periodarr(data, freq) + elif data.dtype == np.int64: + pass + else: + try: + data = data.astype('i8') + except (TypeError, ValueError): + data = data.astype('O') + data = _period_unbox_array(data, check=freq) + + return data, freq + + def __contains__(self, key): + if not isinstance(key, Period) or key.freq != self.freq: + if isinstance(key, basestring): + try: + self.get_loc(key) + return True + except Exception: + return False + return False + return key.ordinal in self._engine + + def astype(self, dtype): + dtype = np.dtype(dtype) + if dtype == np.object_: + result = np.empty(len(self), dtype=dtype) + result[:] = [x for x in self] + return result + elif dtype == _INT64_DTYPE: + return self.values.copy() + else: # pragma: no cover + raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) + + def __iter__(self): + for val in self.values: + yield Period(ordinal=val, freq=self.freq) + + @property + def is_all_dates(self): + return True + + @property + def is_full(self): + """ + Returns True if there are any missing periods from start to end + """ + if len(self) == 0: + return True + if not self.is_monotonic: + raise ValueError('Index is not monotonic') + values = self.values + return ((values[1:] - values[:-1]) < 2).all() + + + @property + def freqstr(self): + return self.freq + + def asfreq(self, freq=None, how='E'): + how = _validate_end_alias(how) + + freq = _freq_mod.get_standard_freq(freq) + + base1, mult1 = _gfc(self.freq) + base2, mult2 = _gfc(freq) + + if mult2 != 1: + raise ValueError('Only mult == 1 supported') + + end = how == 'E' + new_data = plib.period_asfreq_arr(self.values, base1, base2, end) + + result = new_data.view(PeriodIndex) + result.name = self.name + result.freq = freq + return result + + year = _field_accessor('year', 0) + month = _field_accessor('month', 3) + day = _field_accessor('day', 4) + hour = _field_accessor('hour', 5) + minute = _field_accessor('minute', 6) + second = _field_accessor('second', 7) + weekofyear = _field_accessor('week', 8) + week = weekofyear + dayofweek = _field_accessor('dayofweek', 10) + weekday = dayofweek + dayofyear = day_of_year = _field_accessor('dayofyear', 9) + quarter = _field_accessor('quarter', 2) + qyear = _field_accessor('qyear', 1) + + # Try to run function on index first, and then on elements of index + # Especially important for group-by functionality + def map(self, f): + try: + return f(self) + except: + values = self._get_object_array() + return _algos.arrmap_object(values, f) + + def _get_object_array(self): + freq = self.freq + boxfunc = lambda x: Period(ordinal=x, freq=freq) + boxer = np.frompyfunc(boxfunc, 1, 1) + return boxer(self.values) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self._get_object_array() + + def to_timestamp(self, freq=None, how='start'): + """ + Cast to DatetimeIndex + + Parameters + ---------- + freq : string or DateOffset, default 'D' + Target frequency + how : {'s', 'e', 'start', 'end'} + + Returns + ------- + DatetimeIndex + """ + if freq is None: + base, mult = _gfc(self.freq) + new_data = self + else: + base, mult = _gfc(freq) + new_data = self.asfreq(freq, how) + + new_data = plib.periodarr_to_dt64arr(new_data.values, base) + return DatetimeIndex(new_data, freq='infer', name=self.name) + + def shift(self, n): + """ + Specialized shift which produces an PeriodIndex + + Parameters + ---------- + n : int + Periods to shift by + freq : freq string + + Returns + ------- + shifted : PeriodIndex + """ + if n == 0: + 
return self + + return PeriodIndex(data=self.values + n, freq=self.freq) + + def __add__(self, other): + return PeriodIndex(ordinal=self.values + other, freq=self.freq) + + def __sub__(self, other): + return PeriodIndex(ordinal=self.values - other, freq=self.freq) + + @property + def inferred_type(self): + # b/c data is represented as ints make sure we can't have ambiguous + # indexing + return 'period' + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing + """ + try: + return super(PeriodIndex, self).get_value(series, key) + except (KeyError, IndexError): + try: + asdt, parsed, reso = parse_time_string(key, self.freq) + grp = _freq_mod._infer_period_group(reso) + freqn = _freq_mod._period_group(self.freq) + + vals = self.values + + # if our data is higher resolution than requested key, slice + if grp < freqn: + iv = Period(asdt, freq=(grp,1)) + ord1 = iv.asfreq(self.freq, how='S').ordinal + ord2 = iv.asfreq(self.freq, how='E').ordinal + + if ord2 < vals[0] or ord1 > vals[-1]: + raise KeyError(key) + + pos = np.searchsorted(self.values, [ord1, ord2]) + key = slice(pos[0], pos[1]+1) + return series[key] + else: + key = Period(asdt, freq=self.freq) + return self._engine.get_value(series, key.ordinal) + except TypeError: + pass + except KeyError: + pass + + key = Period(key, self.freq) + return self._engine.get_value(series, key.ordinal) + + def get_loc(self, key): + """ + Get integer location for requested label + + Returns + ------- + loc : int + """ + try: + return self._engine.get_loc(key) + except KeyError: + try: + asdt, parsed, reso = parse_time_string(key, self.freq) + key = asdt + except TypeError: + pass + + key = Period(key, self.freq).ordinal + return self._engine.get_loc(key) + + def join(self, other, how='left', level=None, return_indexers=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + result = Int64Index.join(self, other, how=how, level=level, + return_indexers=return_indexers) + + if return_indexers: + result, lidx, ridx = result + return self._apply_meta(result), lidx, ridx + else: + return self._apply_meta(result) + + def _assert_can_do_setop(self, other): + if not isinstance(other, PeriodIndex): + raise ValueError('can only call with other PeriodIndex-ed objects') + + if self.freq != other.freq: + raise ValueError('Only like-indexed PeriodIndexes compatible ' + 'for join (for now)') + + def _wrap_union_result(self, other, result): + name = self.name if self.name == other.name else None + result = self._apply_meta(result) + result.name = name + return result + + def _apply_meta(self, rawarr): + idx = rawarr.view(PeriodIndex) + idx.freq = self.freq + return idx + + def __getitem__(self, key): + """Override numpy.ndarray's __getitem__ method to work as desired""" + arr_idx = self.view(np.ndarray) + if np.isscalar(key): + val = arr_idx[key] + return Period(ordinal=val, freq=self.freq) + else: + if com._is_bool_indexer(key): + key = np.asarray(key) + + result = arr_idx[key] + if result.ndim > 1: + # MPL kludge + # values = np.asarray(list(values), dtype=object) + # return values.reshape(result.shape) + + return PeriodIndex(result, name=self.name, freq=self.freq) + + return PeriodIndex(result, name=self.name, freq=self.freq) + + def format(self, name=False): + """ + Render a string representation of the Index + """ + header = [] + + if name: + header.append(str(self.name) if self.name is not None else '') + + return header + ['%s' % Period(x, freq=self.freq) for x 
in self] + + def __array_finalize__(self, obj): + if self.ndim == 0: # pragma: no cover + return self.item() + + self.freq = getattr(obj, 'freq', None) + + def __repr__(self): + output = str(self.__class__) + '\n' + output += 'freq: ''%s''\n' % self.freq + if len(self) > 0: + output += '[%s, ..., %s]\n' % (self[0], self[-1]) + output += 'length: %d' % len(self) + return output + + def take(self, indices, axis=None): + """ + Analogous to ndarray.take + """ + taken = self.values.take(indices, axis=axis) + taken = taken.view(PeriodIndex) + taken.freq = self.freq + taken.name = self.name + return taken + + +def _get_ordinal_range(start, end, periods, freq): + if com._count_not_none(start, end, periods) < 2: + raise ValueError('Must specify 2 of start, end, periods') + + if start is not None: + start = Period(start, freq) + if end is not None: + end = Period(end, freq) + + is_start_per = isinstance(start, Period) + is_end_per = isinstance(end, Period) + + if is_start_per and is_end_per and (start.freq != end.freq): + raise ValueError('Start and end must have same freq') + + if freq is None: + if is_start_per: + freq = start.freq + elif is_end_per: + freq = end.freq + else: # pragma: no cover + raise ValueError('Could not infer freq from start/end') + + if periods is not None: + if start is None: + data = np.arange(end.ordinal - periods + 1, + end.ordinal + 1, + dtype=np.int64) + else: + data = np.arange(start.ordinal, start.ordinal + periods, + dtype=np.int64) + else: + data = np.arange(start.ordinal, end.ordinal+1, dtype=np.int64) + + return data, freq + +def _range_from_fields(year=None, month=None, quarter=None, day=None, + hour=None, minute=None, second=None, freq=None): + if hour is None: + hour = 0 + if minute is None: + minute = 0 + if second is None: + second = 0 + if day is None: + day = 1 + + ordinals = [] + + if quarter is not None: + if freq is None: + freq = 'Q' + base = FreqGroup.FR_QTR + else: + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + assert(base == FreqGroup.FR_QTR) + + year, quarter = _make_field_arrays(year, quarter) + for y, q in zip(year, quarter): + y, m = _quarter_to_myear(y, q, freq) + val = plib.period_ordinal(y, m, 1, 1, 1, 1, base) + ordinals.append(val) + else: + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + + arrays = _make_field_arrays(year, month, day, hour, minute, second) + for y, mth, d, h, mn, s in zip(*arrays): + ordinals.append(plib.period_ordinal(y, mth, d, h, mn, s, base)) + + return np.array(ordinals, dtype=np.int64), freq + +def _make_field_arrays(*fields): + length = None + for x in fields: + if isinstance(x, (list, np.ndarray)): + if length is not None and len(x) != length: + raise ValueError('Mismatched Period array lengths') + elif length is None: + length = len(x) + + arrays = [np.asarray(x) if isinstance(x, (np.ndarray, list)) + else np.repeat(x, length) for x in fields] + + return arrays + + +def _ordinal_from_fields(year, month, quarter, day, hour, minute, + second, freq): + base, mult = _gfc(freq) + if mult != 1: + raise ValueError('Only mult == 1 supported') + + if quarter is not None: + year, month = _quarter_to_myear(year, quarter, freq) + + return plib.period_ordinal(year, month, day, hour, minute, second, base) + +def _quarter_to_myear(year, quarter, freq): + if quarter is not None: + if quarter <= 0 or quarter > 4: + raise ValueError('Quarter must be 1 <= q <= 4') + + mnum = _month_numbers[_freq_mod._get_rule_month(freq)] + 1 + month = (mnum + 
(quarter - 1) * 3) % 12 + 1 + if month > mnum: + year -= 1 + + return year, month + + +def _validate_end_alias(how): + how_dict = {'S': 'S', 'E': 'E', + 'START': 'S', 'FINISH': 'E', + 'BEGIN': 'S', 'END': 'E'} + how = how_dict.get(str(how).upper()) + if how not in set(['S', 'E']): + raise ValueError('How must be one of S or E') + return how + +def pnow(freq=None): + return Period(datetime.now(), freq=freq) + +def period_range(start=None, end=None, periods=None, freq='D'): + """ + Return a fixed frequency datetime index, with day (calendar) as the default + frequency + + + Parameters + ---------- + start : + end : + normalize : bool, default False + Normalize start/end dates to midnight before generating date range + + Returns + ------- + + """ + return PeriodIndex(start=start, end=end, periods=periods, + freq=freq) + +def _period_rule_to_timestamp_rule(freq, how='end'): + how = how.lower() + if how in ('end', 'e'): + return freq + else: + if freq.startswith('A-') or freq.startswith('BA-'): + base, color = freq.split('-') + return '%sS-%s' % (base, color) + return freq diff --git a/pandas/tseries/plotting.py b/pandas/tseries/plotting.py new file mode 100644 index 00000000..678c272b --- /dev/null +++ b/pandas/tseries/plotting.py @@ -0,0 +1,181 @@ +""" +Adapted from scikits.timeseries by Pierre GF Gerard-Marchant & Matt Knox +""" + +#!!! TODO: Use the fact that axis can have units to simplify the process +import datetime as pydt +from datetime import datetime + +from matplotlib import pylab +import matplotlib.units as units + +import numpy as np + +from pandas import isnull +from pandas.tseries.period import Period +from pandas.tseries.offsets import DateOffset +import pandas.tseries.frequencies as frequencies +from pandas.tseries.index import DatetimeIndex +import pandas.core.common as com + +from pandas.tseries.converter import (PeriodConverter, TimeSeries_DateLocator, + TimeSeries_DateFormatter) + +units.registry[Period] = PeriodConverter() +#---------------------------------------------------------------------- +# Plotting functions and monkey patches + +def tsplot(series, plotf, **kwargs): + """ + Plots a Series on the given Matplotlib axes or the current axes + + Parameters + ---------- + axes : Axes + series : Series + + Notes + _____ + Supports same kwargs as Axes.plot + + """ + # Used inferred freq is possible, need a test case for inferred + if 'ax' in kwargs: + ax = kwargs.pop('ax') + else: + import matplotlib.pyplot as plt + ax = plt.gca() + + freq = _get_freq(ax, series) + # resample against axes freq if necessary + if freq is None: # pragma: no cover + raise ValueError('Cannot use dynamic axis without frequency info') + else: + ax_freq = getattr(ax, 'freq', None) + if (ax_freq is not None) and (freq != ax_freq): + if frequencies.is_subperiod(freq, ax_freq): # downsample + how = kwargs.pop('how', 'last') + series = series.resample(ax_freq, how=how) + elif frequencies.is_superperiod(freq, ax_freq): + series = series.resample(ax_freq) + else: # one freq is weekly + how = kwargs.pop('how', 'last') + series = series.resample('D', how=how, fill_method='pad') + series = series.resample(ax_freq, how=how, fill_method='pad') + freq = ax_freq + + # Convert DatetimeIndex to PeriodIndex + if isinstance(series.index, DatetimeIndex): + series = series.to_period(freq=freq) + + style = kwargs.pop('style', None) + + # Specialized ts plotting attributes for Axes + ax.freq = freq + xaxis = ax.get_xaxis() + xaxis.freq = freq + ax.legendlabels = [kwargs.get('label', None)] + ax.view_interval = None + 
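    # These Axes attributes stash time-series plotting state: ax.freq is read
    # back by _get_freq() when another series is plotted on the same Axes, so
    # the new data can be resampled to the frequency already in use, and
    # format_dateaxis() below uses it to install the period-aware tick
    # locators and formatters.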
ax.date_axis_info = None + + # format args and lot + args = _maybe_mask(series) + + if style is not None: + args.append(style) + + plotf(ax, *args, **kwargs) + + format_dateaxis(ax, ax.freq) + + left, right = _get_xlim(ax.get_lines()) + ax.set_xlim(left, right) + + return ax + +def _maybe_mask(series): + mask = isnull(series) + if mask.any(): + masked_array = np.ma.array(series.values) + masked_array = np.ma.masked_where(mask, masked_array) + args = [series.index, masked_array] + else: + args = [series.index, series] + return args + +def _get_freq(ax, series): + # get frequency from data + freq = getattr(series.index, 'freq', None) + if freq is None: + freq = getattr(series.index, 'inferred_freq', None) + + ax_freq = getattr(ax, 'freq', None) + + # use axes freq if no data freq + if freq is None: + freq = ax_freq + + # get the period frequency + if isinstance(freq, DateOffset): + freq = freq.rule_code + else: + freq = frequencies.get_base_alias(freq) + + freq = frequencies.get_period_alias(freq) + + return freq + +def _get_xlim(lines): + left, right = np.inf, -np.inf + for l in lines: + x = l.get_xdata() + left = min(x[0].ordinal, left) + right = max(x[-1].ordinal, right) + return left, right + +def get_datevalue(date, freq): + if isinstance(date, Period): + return date.asfreq(freq).ordinal + elif isinstance(date, (str, datetime, pydt.date, pydt.time)): + return Period(date, freq).ordinal + elif (com.is_integer(date) or com.is_float(date) or + (isinstance(date, np.ndarray) and (date.size == 1))): + return date + elif date is None: + return None + raise ValueError("Unrecognizable date '%s'" % date) + +# Patch methods for subplot. Only format_dateaxis is currently used. +# Do we need the rest for convenience? + +def format_dateaxis(subplot, freq): + """ + Pretty-formats the date axis (x-axis). + + Major and minor ticks are automatically set for the frequency of the + current underlying series. As the dynamic mode is activated by + default, changing the limits of the x axis will intelligently change + the positions of the ticks. 
+ """ + majlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, + minor_locator=False, + plot_obj=subplot) + minlocator = TimeSeries_DateLocator(freq, dynamic_mode=True, + minor_locator=True, + plot_obj=subplot) + subplot.xaxis.set_major_locator(majlocator) + subplot.xaxis.set_minor_locator(minlocator) + + majformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=False, + plot_obj=subplot) + minformatter = TimeSeries_DateFormatter(freq, dynamic_mode=True, + minor_locator=True, + plot_obj=subplot) + subplot.xaxis.set_major_formatter(majformatter) + subplot.xaxis.set_minor_formatter(minformatter) + pylab.draw_if_interactive() + + + + diff --git a/pandas/tseries/resample.py b/pandas/tseries/resample.py new file mode 100644 index 00000000..a2b1c203 --- /dev/null +++ b/pandas/tseries/resample.py @@ -0,0 +1,329 @@ +from datetime import timedelta + +import numpy as np + +from pandas.core.groupby import BinGrouper, CustomGrouper +from pandas.tseries.frequencies import to_offset, is_subperiod, is_superperiod +from pandas.tseries.index import DatetimeIndex, date_range +from pandas.tseries.offsets import DateOffset, Tick, _delta_to_nanoseconds +from pandas.tseries.period import PeriodIndex, period_range +import pandas.core.common as com + +from pandas.lib import Timestamp +import pandas.lib as lib + + +class TimeGrouper(CustomGrouper): + """ + Custom groupby class for time-interval grouping + + Parameters + ---------- + rule : pandas offset string or object for identifying bin edges + closed : closed end of interval; left (default) or right + label : interval boundary to use for labeling; left (default) or right + nperiods : optional, integer + convention : {'start', 'end', 'e', 's'} + If axis is PeriodIndex + + Notes + ----- + Use begin, end, nperiods to generate intervals that cannot be derived + directly from the associated object + """ + def __init__(self, freq='Min', closed='right', label='right', how='mean', + nperiods=None, axis=0, + fill_method=None, limit=None, loffset=None, kind=None, + convention=None, base=0): + self.freq = freq + self.closed = closed + self.label = label + self.nperiods = nperiods + self.kind = kind + self.convention = convention or 'E' + self.axis = axis + self.loffset = loffset + self.how = how + self.fill_method = fill_method + self.limit = limit + self.base = base + + def resample(self, obj): + axis = obj._get_axis(self.axis) + if isinstance(axis, DatetimeIndex): + return self._resample_timestamps(obj) + elif isinstance(axis, PeriodIndex): + offset = to_offset(self.freq) + if offset.n > 1: + if self.kind == 'period': # pragma: no cover + print 'Warning: multiple of frequency -> timestamps' + # Cannot have multiple of periods, convert to timestamp + self.kind = 'timestamp' + + if self.kind is None or self.kind == 'period': + return self._resample_periods(obj) + else: + obj = obj.to_timestamp(how=self.convention) + return self._resample_timestamps(obj) + else: # pragma: no cover + raise TypeError('Only valid with DatetimeIndex or PeriodIndex') + + def get_grouper(self, obj): + # Only return grouper + return self._get_time_grouper(obj)[1] + + def _get_time_grouper(self, obj): + axis = obj._get_axis(self.axis) + + if self.kind is None or self.kind == 'timestamp': + binner, bins, binlabels = self._get_time_bins(axis) + else: + binner, bins, binlabels = self._get_time_period_bins(axis) + + grouper = BinGrouper(bins, binlabels) + return binner, grouper + + def _get_time_bins(self, axis): + assert(isinstance(axis, DatetimeIndex)) + + if len(axis) 
== 0: + binner = labels = DatetimeIndex(data=[], freq=self.freq) + return binner, [], labels + + first, last = _get_range_edges(axis, self.freq, closed=self.closed, + base=self.base) + binner = labels = DatetimeIndex(freq=self.freq, start=first, end=last) + + # a little hack + trimmed = False + if len(binner) > 2 and binner[-2] == axis[-1] and self.closed == 'right': + binner = binner[:-1] + trimmed = True + + ax_values = axis.asi8 + binner, bin_edges = self._adjust_bin_edges(binner, ax_values) + + # general version, knowing nothing about relative frequencies + bins = lib.generate_bins_dt64(ax_values, bin_edges, self.closed) + + if self.closed == 'right': + labels = binner + if self.label == 'right': + labels = labels[1:] + elif not trimmed: + labels = labels[:-1] + else: + if self.label == 'right': + labels = labels[1:] + elif not trimmed: + labels = labels[:-1] + + return binner, bins, labels + + def _adjust_bin_edges(self, binner, ax_values): + # Some hacks for > daily data, see #1471, #1458, #1483 + + bin_edges = binner.asi8 + + if self.freq != 'D' and is_superperiod(self.freq, 'D'): + day_nanos = _delta_to_nanoseconds(timedelta(1)) + if self.closed == 'right': + bin_edges = bin_edges + day_nanos - 1 + else: + bin_edges = bin_edges + day_nanos + + # intraday values on last day + if bin_edges[-2] > ax_values[-1]: + bin_edges = bin_edges[:-1] + binner = binner[:-1] + + return binner, bin_edges + + def _get_time_period_bins(self, axis): + assert(isinstance(axis, DatetimeIndex)) + + if len(axis) == 0: + binner = labels = PeriodIndex(data=[], freq=self.freq) + return binner, [], labels + + labels = binner = PeriodIndex(start=axis[0], end=axis[-1], + freq=self.freq) + + end_stamps = (labels + 1).asfreq('D', 's').to_timestamp() + bins = axis.searchsorted(end_stamps, side='left') + + return binner, bins, labels + + def _resample_timestamps(self, obj): + axlabels = obj._get_axis(self.axis) + + binner, grouper = self._get_time_grouper(obj) + + # Determine if we're downsampling + if axlabels.freq is not None or axlabels.inferred_freq is not None: + if len(grouper.binlabels) < len(axlabels): + grouped = obj.groupby(grouper, axis=self.axis) + result = grouped.aggregate(self.how) + else: + # upsampling shortcut + assert(self.axis == 0) + result = obj.reindex(binner[1:], method=self.fill_method, + limit=self.limit) + else: + # Irregular data, have to use groupby + grouped = obj.groupby(grouper, axis=self.axis) + result = grouped.aggregate(self.how) + + if self.fill_method is not None: + result = result.fillna(method=self.fill_method, limit=self.limit) + + loffset = self.loffset + if isinstance(loffset, basestring): + loffset = to_offset(self.loffset) + + if isinstance(loffset, (DateOffset, timedelta)): + if (isinstance(result.index, DatetimeIndex) + and len(result.index) > 0): + + result.index = result.index + loffset + + return result + + def _resample_periods(self, obj): + axlabels = obj._get_axis(self.axis) + + if len(axlabels) == 0: + new_index = PeriodIndex(data=[], freq=self.freq) + return obj.reindex(new_index) + else: + start = axlabels[0].asfreq(self.freq, how=self.convention) + end = axlabels[-1].asfreq(self.freq, how=self.convention) + new_index = period_range(start, end, freq=self.freq) + + # Start vs. 
end of period + memb = axlabels.asfreq(self.freq, how=self.convention) + + if is_subperiod(axlabels.freq, self.freq): + # Downsampling + rng = np.arange(memb.values[0], memb.values[-1]) + bins = memb.searchsorted(rng, side='right') + grouper = BinGrouper(bins, new_index) + + grouped = obj.groupby(grouper, axis=self.axis) + return grouped.aggregate(self.how) + elif is_superperiod(axlabels.freq, self.freq): + # Get the fill indexer + indexer = memb.get_indexer(new_index, method=self.fill_method, + limit=self.limit) + + return _take_new_index(obj, indexer, new_index, axis=self.axis) + else: + raise ValueError('Frequency %s cannot be resampled to %s' + % (axlabels.freq, self.freq)) + + +def _take_new_index(obj, indexer, new_index, axis=0): + from pandas.core.api import Series, DataFrame + from pandas.core.internals import BlockManager + + if isinstance(obj, Series): + new_values = com.take_1d(obj.values, indexer) + return Series(new_values, index=new_index, name=obj.name) + elif isinstance(obj, DataFrame): + if axis == 1: + raise NotImplementedError + data = obj._data + + new_blocks = [b.take(indexer, axis=1) for b in data.blocks] + new_axes = list(data.axes) + new_axes[1] = new_index + new_data = BlockManager(new_blocks, new_axes) + return DataFrame(new_data) + else: + raise NotImplementedError + + + +def _get_range_edges(axis, offset, closed='left', base=0): + if isinstance(offset, basestring): + offset = to_offset(offset) + + if isinstance(offset, Tick): + day_nanos = _delta_to_nanoseconds(timedelta(1)) + # #1165 + if (day_nanos % offset.nanos) == 0: + return _adjust_dates_anchored(axis[0], axis[-1], offset, + closed=closed, base=base) + + if closed == 'left': + first = Timestamp(offset.rollback(axis[0])) + else: + first = Timestamp(axis[0] - offset) + + last = Timestamp(axis[-1] + offset) + + return first, last + + +def _adjust_dates_anchored(first, last, offset, closed='right', base=0): + from pandas.tseries.tools import normalize_date + + start_day_nanos = Timestamp(normalize_date(first)).value + last_day_nanos = Timestamp(normalize_date(last)).value + + base_nanos = (base % offset.n) * offset.nanos // offset.n + start_day_nanos += base_nanos + last_day_nanos += base_nanos + + foffset = (first.value - start_day_nanos) % offset.nanos + loffset = (last.value - last_day_nanos) % offset.nanos + + if closed == 'right': + if foffset > 0: + # roll back + fresult = first.value - foffset + else: + fresult = first.value - offset.nanos + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + # already the end of the road + lresult = last.value + else: # closed == 'left' + if foffset > 0: + fresult = first.value - foffset + else: + # start of the road + fresult = first.value + + if loffset > 0: + # roll forward + lresult = last.value + (offset.nanos - loffset) + else: + lresult = last.value + offset.nanos + + return Timestamp(fresult), Timestamp(lresult) + + +def asfreq(obj, freq, method=None, how=None): + """ + Utility frequency conversion method for Series/DataFrame + """ + if isinstance(obj.index, PeriodIndex): + if method is not None: + raise NotImplementedError + + if how is None: + how = 'E' + + new_index = obj.index.asfreq(freq, how=how) + new_obj = obj.copy() + new_obj.index = new_index + return new_obj + else: + if len(obj.index) == 0: + return obj.copy() + dti = date_range(obj.index[0], obj.index[-1], freq=freq) + return obj.reindex(dti, method=method) diff --git a/pandas/tseries/tests/__init__.py b/pandas/tseries/tests/__init__.py new file mode 
100644 index 00000000..e69de29b diff --git a/pandas/tseries/tests/data/daterange_073.pickle b/pandas/tseries/tests/data/daterange_073.pickle new file mode 100644 index 0000000000000000000000000000000000000000..0214a023e6338dce54e6daf8b3d94a7275baca66 GIT binary patch literal 650 zcmZY7OH0E*5C`z4PrH3+U)q<}*CAed_9jSAE<`BoQDl>BZ848-vOy{q^blLWo!>~) zb{G%N!ZQ3A=JKESwB<$ad@;2AKn&f;Q8OL{d_f)qVfkLDg2+-tYSx^4HV=1WHdi9x z-jg7sq#JKLnWm|jY36DyGdk61Gu|yGwpz>uky)0$zosdwB?CE~W|;P77{=XCQrnN- zDD&$<=5=ecUCmrUu#p8u3g22LwXJw8_oh3^q7*@LCj=6X`nPgnkX%h7Rn(=8|4V3gVF}+qI5udC|!^~N>3;w{`{A; z@_i>Hx1;1JWdG_z9xvsI&WfHNxZIh&3OQJ_yg!+QLdny=^fnRN!cm;avn2QACCQ(& Y?DLBq%8RAEoDS9@(>$t0rm-@Izk(m6nE(I) literal 0 HcmV?d00001 diff --git a/pandas/tseries/tests/data/frame.pickle b/pandas/tseries/tests/data/frame.pickle new file mode 100644 index 0000000000000000000000000000000000000000..b3b100fb43022faf7bd0b949238988afc7de53bf GIT binary patch literal 1182 zcmZo*N-jvuOGzx&OU^G!)k`Z%%uVHTNi0cp1G5SlH5&_2m1O3Xq!#5R<`i=|<>V)4 z`zGckrl%Hh6*2==vcwj$Y9!~C<`z^!%*!p!DalMMDoU*6iZ4n{&d)0@DJo4a;VNW9 zu{JX=CAEUfGq1$V#1qUcWcOxh4P{Jf4=Uu)@MiR8ZH1W1l~Ph!kjhoa8OoGt;mzR9 z2voqO;msV%XyfPS=k*^5z=StLNm6I11_Kl@LTM%_%?zbkpmd2}YgY7m%J$PITE56D?utru>CGx=D$H7fJyWf^==6j7BJDUVc$-VoqjNYN2dL zC|iD7T5)Pgp&TM4K*5ocnp2XJQig0taVTS+H)Cm% zUwcw&Y@sqRmcZ$Y3z%rZ>8el#9w(~cq~guh28xw5SgfewNFN;`6M*TW;fH?PbI(qD zpGB$Jd180$pY_~~+OhDdefBm(`J$sw?Iq%0oobwM+g|&L=xlec2ljnYDGD!n-q>g8 zUwoA0f6#t{RL7q$(nsuHeSXyJ?tR(*#=mDLA1WNNXXv=rfBD92`>Bn3vYT!_x3`GD za8a>$i~TLpIhKblp4tZ;l`Kf#cV$0_Toy_unf!A`-q_VZL0AG-eV zsr|Adk2*u+Yxb8^`t(}$F4?E}MdinQzi+=`r_oc77x(PTws(Kna^b%HBhDjt=Q_T$ z&pcMAp>pVf{gs02oSzq6vX}X#bi8QMeftj!9HuX9TW+tmz5In8^E>;!XC?%#<#=n~ z!1O`X*y@NqU;2%8|888h*FUt3f9K|R_OmDVNaSvOY(Fu@LfxqNfxVtm!`}<-H-Wh& PF}6@WgCns$DM=3iVMVf9 literal 0 HcmV?d00001 diff --git a/pandas/tseries/tests/data/series.pickle b/pandas/tseries/tests/data/series.pickle new file mode 100644 index 0000000000000000000000000000000000000000..307a4ac26517384a8267b23a8891e4cf919a20d8 GIT binary patch literal 646 zcmZo*O3o|IEvVE>&M!*U%Pq|*$xJLNO049HFG@|$&nqq|DorloDr8J9NX$z~EQTm6 zPA$qzE#?Zz%uNl3FbkQy8CpXbliGs{nKir_y}4Q;#&V^UR2HOi6|#gfrCE40cryYO zuxfZShcepu`T2SM2LdqR%}|om85;0gre(H&?L+&pe^2_|L|)soNw4(YaOa)<>%zvK zYfcaC6He?G)>?kezSjQ7#_DSi>^Cm|Q~hw#b9(`k2lLpcEV8#d5t>_a^o4!SJ<&Jf z{KxD|GEg0!l30>jl$e*E%H;xN1%X+GY;dQuL!6!gbge(kwH#pA)}Xr99_ZTGLQaij zkbxz@VBmr?3b{hL*sn4&Gk`&BP$72)M1%z{!UGjyg^Tb)McCjXd{7Z~xClQ~gbOYr z02SeeiwHtRc;F&JP!V3Zh%i)y4=y5-TH@E*h7!YI@8sv_6mvPb024!@sAglKSZ$%W zMkr@qeo<~>PG(hVp+rY0TYg$vacW7SBqAh0!I6@hQ!OOZ59U9nA?q0TAECTVJ&E1=s@p_O5Y99O<1%Q zB;BptG-d=J1D0mc)|$GRnB*5MVgwiVVm1}zl5>9Zji(6111*Nz)~! 
zRiE7aziNM|S|k1TkABe8-^TPSq(1=Pb|Z~nI#^obZBbmI= 28) + self.assert_(t.month == (12 if i == 0 else i)) + self.assert_(t.year == 2001 + (i != 0)) + off.next() + + for i in range(11, -1, -1): + off.prev() + t = lib.Timestamp(off.ts) + self.assert_(t.day >= 28) + self.assert_(t.month == (12 if i == 0 else i)) + self.assert_(t.year == 2001 + (i != 0)) + + off = lib.MonthOffset(dayoffset=-1, biz=-1, anchor=datetime(2002,1,1)) + + stack = [] + + for i in range(500): + t = lib.Timestamp(off.ts) + stack.append(t) + if t.month != 2: + self.assert_(t.day >= 28) + else: + self.assert_(t.day >= 26) + self.assert_(t.weekday() < 5) + off.next() + + for i in range(499, -1, -1): + off.prev() + t = lib.Timestamp(off.ts) + self.assert_(t == stack.pop()) + if t.month != 2: + self.assert_(t.day >= 28) + else: + self.assert_(t.day >= 26) + self.assert_(t.weekday() < 5) + + for i in (-2, -1, 1, 2): + for j in (-1, 0, 1): + off1 = lib.MonthOffset(dayoffset=i, biz=j, stride=12, + anchor=datetime(2002,1,1)) + off2 = lib.YearOffset(dayoffset=i, biz=j, + anchor=datetime(2002,1,1)) + + for k in range(500): + self.assert_(off1.ts == off2.ts) + off1.next() + off2.next() + + for k in range(500): + self.assert_(off1.ts == off2.ts) + off1.prev() + off2.prev() + + def test_dayoffset(self): + off = lib.DayOffset(biz=0, anchor=datetime(2002,1,1)) + + us_in_day = 1e6 * 60 * 60 * 24 + + t0 = lib.Timestamp(off.ts) + for i in range(500): + off.next() + t1 = lib.Timestamp(off.ts) + self.assert_(t1.value - t0.value == us_in_day) + t0 = t1 + + t0 = lib.Timestamp(off.ts) + for i in range(499, -1, -1): + off.prev() + t1 = lib.Timestamp(off.ts) + self.assert_(t0.value - t1.value == us_in_day) + t0 = t1 + + off = lib.DayOffset(biz=1, anchor=datetime(2002,1,1)) + + t0 = lib.Timestamp(off.ts) + for i in range(500): + off.next() + t1 = lib.Timestamp(off.ts) + self.assert_(t1.weekday() < 5) + self.assert_(t1.value - t0.value == us_in_day or + t1.value - t0.value == 3 * us_in_day) + t0 = t1 + + t0 = lib.Timestamp(off.ts) + for i in range(499, -1, -1): + off.prev() + t1 = lib.Timestamp(off.ts) + self.assert_(t1.weekday() < 5) + self.assert_(t0.value - t1.value == us_in_day or + t0.value - t1.value == 3 * us_in_day) + t0 = t1 + + + def test_dayofmonthoffset(self): + for week in (-1, 0, 1): + for day in (0, 2, 4): + off = lib.DayOfMonthOffset(week=-1, day=day, + anchor=datetime(2002,1,1)) + + stack = [] + + for i in range(500): + t = lib.Timestamp(off.ts) + stack.append(t) + self.assert_(t.weekday() == day) + off.next() + + for i in range(499, -1, -1): + off.prev() + t = lib.Timestamp(off.ts) + self.assert_(t == stack.pop()) + self.assert_(t.weekday() == day) + + +""" diff --git a/pandas/tseries/tests/test_daterange.py b/pandas/tseries/tests/test_daterange.py new file mode 100644 index 00000000..625eadbc --- /dev/null +++ b/pandas/tseries/tests/test_daterange.py @@ -0,0 +1,304 @@ +from datetime import datetime +import pickle +import unittest + +import numpy as np + +from pandas.core.index import Index +from pandas.tseries.index import DatetimeIndex + +from pandas import Timestamp +from pandas.tseries.offsets import generate_range +from pandas.tseries.index import bdate_range, date_range +import pandas.tseries.tools as tools + +import pandas.core.datetools as datetools + +def eq_gen_range(kwargs, expected): + rng = generate_range(**kwargs) + assert(np.array_equal(list(rng), expected)) + +START, END = datetime(2009, 1, 1), datetime(2010, 1, 1) + +class TestGenRangeGeneration(unittest.TestCase): + def test_generate(self): + rng1 = 
list(generate_range(START, END, offset=datetools.bday)) + rng2 = list(generate_range(START, END, time_rule='B')) + self.assert_(np.array_equal(rng1, rng2)) + + def test_1(self): + eq_gen_range(dict(start=datetime(2009, 3, 25), periods=2), + [datetime(2009, 3, 25), datetime(2009, 3, 26)]) + + def test_2(self): + eq_gen_range(dict(start=datetime(2008, 1, 1), + end=datetime(2008, 1, 3)), + [datetime(2008, 1, 1), + datetime(2008, 1, 2), + datetime(2008, 1, 3)]) + + def test_3(self): + eq_gen_range(dict(start = datetime(2008, 1, 5), + end = datetime(2008, 1, 6)), + []) + +class TestDateRange(unittest.TestCase): + + def setUp(self): + self.rng = bdate_range(START, END) + + def test_constructor(self): + rng = bdate_range(START, END, freq=datetools.bday) + rng = bdate_range(START, periods=20, freq=datetools.bday) + rng = bdate_range(end=START, periods=20, freq=datetools.bday) + self.assertRaises(ValueError, date_range, '2011-1-1', '2012-1-1', 'B') + self.assertRaises(ValueError, bdate_range, '2011-1-1', '2012-1-1', 'B') + + def test_cached_range(self): + rng = DatetimeIndex._cached_range(START, END, + offset=datetools.bday) + rng = DatetimeIndex._cached_range(START, periods=20, + offset=datetools.bday) + rng = DatetimeIndex._cached_range(end=START, periods=20, + offset=datetools.bday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, END) + + self.assertRaises(Exception, DatetimeIndex._cached_range, START, + freq=datetools.bday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, end=END, + freq=datetools.bday) + + self.assertRaises(Exception, DatetimeIndex._cached_range, periods=20, + freq=datetools.bday) + + def test_cached_range_bug(self): + rng = date_range('2010-09-01 05:00:00', periods=50, + freq=datetools.DateOffset(hours=6)) + self.assertEquals(len(rng), 50) + self.assertEquals(rng[0], datetime(2010, 9, 1, 5)) + + def test_comparison(self): + d = self.rng[10] + + comp = self.rng > d + self.assert_(comp[11]) + self.assert_(not comp[9]) + + def test_copy(self): + cp = self.rng.copy() + repr(cp) + self.assert_(cp.equals(self.rng)) + + def test_repr(self): + # only really care that it works + repr(self.rng) + + def test_getitem(self): + smaller = self.rng[:5] + self.assert_(np.array_equal(smaller, self.rng.view(np.ndarray)[:5])) + self.assertEquals(smaller.offset, self.rng.offset) + + sliced = self.rng[::5] + self.assertEquals(sliced.offset, datetools.bday * 5) + + fancy_indexed = self.rng[[4, 3, 2, 1, 0]] + self.assertEquals(len(fancy_indexed), 5) + self.assert_(isinstance(fancy_indexed, DatetimeIndex)) + self.assert_(fancy_indexed.freq is None) + + # 32-bit vs. 
64-bit platforms + self.assertEquals(self.rng[4], self.rng[np.int_(4)]) + + def test_getitem_matplotlib_hackaround(self): + values = self.rng[:, None] + expected = self.rng.values[:, None] + self.assert_(np.array_equal(values, expected)) + + def test_shift(self): + shifted = self.rng.shift(5) + self.assertEquals(shifted[0], self.rng[5]) + self.assertEquals(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(-5) + self.assertEquals(shifted[5], self.rng[0]) + self.assertEquals(shifted.offset, self.rng.offset) + + shifted = self.rng.shift(0) + self.assertEquals(shifted[0], self.rng[0]) + self.assertEquals(shifted.offset, self.rng.offset) + + rng = date_range(START, END, freq=datetools.bmonthEnd) + shifted = rng.shift(1, freq=datetools.bday) + self.assertEquals(shifted[0], rng[0] + datetools.bday) + + def test_pickle_unpickle(self): + pickled = pickle.dumps(self.rng) + unpickled = pickle.loads(pickled) + + self.assert_(unpickled.offset is not None) + + def test_union(self): + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_union = left.union(right) + self.assert_(isinstance(the_union, DatetimeIndex)) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_union = left.union(right) + self.assert_(isinstance(the_union, Index)) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_union = left.union(right) + self.assert_(isinstance(the_union, DatetimeIndex)) + + # order does not matter + self.assert_(np.array_equal(right.union(left), the_union)) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_union = self.rng.union(rng) + self.assert_(isinstance(the_union, DatetimeIndex)) + + def test_outer_join(self): + # should just behave as union + + # overlapping + left = self.rng[:10] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + self.assert_(isinstance(the_join, DatetimeIndex)) + + # non-overlapping, gap in middle + left = self.rng[:5] + right = self.rng[10:] + + the_join = left.join(right, how='outer') + self.assert_(isinstance(the_join, DatetimeIndex)) + self.assert_(the_join.freq is None) + + # non-overlapping, no gap + left = self.rng[:5] + right = self.rng[5:10] + + the_join = left.join(right, how='outer') + self.assert_(isinstance(the_join, DatetimeIndex)) + + # overlapping, but different offset + rng = date_range(START, END, freq=datetools.bmonthEnd) + + the_join = self.rng.join(rng, how='outer') + self.assert_(isinstance(the_join, DatetimeIndex)) + self.assert_(the_join.freq is None) + + def test_union_not_cacheable(self): + rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_union = rng1.union(rng2) + self.assert_(the_union.equals(rng)) + + rng1 = rng[10:] + rng2 = rng[15:35] + the_union = rng1.union(rng2) + expected = rng[10:] + self.assert_(the_union.equals(expected)) + + def test_intersection(self): + rng = date_range('1/1/2000', periods=50, freq=datetools.Minute()) + rng1 = rng[10:] + rng2 = rng[:25] + the_int = rng1.intersection(rng2) + expected = rng[10:25] + self.assert_(the_int.equals(expected)) + self.assert_(isinstance(the_int, DatetimeIndex)) + self.assert_(the_int.offset == rng.offset) + + the_int = rng1.intersection(rng2.view(DatetimeIndex)) + self.assert_(the_int.equals(expected)) + + # non-overlapping + the_int = rng[:10].intersection(rng[10:]) + expected = DatetimeIndex([]) + self.assert_(the_int.equals(expected)) + + def 
test_intersection_bug(self): + # GH #771 + a = bdate_range('11/30/2011','12/31/2011') + b = bdate_range('12/10/2011','12/20/2011') + result = a.intersection(b) + self.assert_(result.equals(b)) + + def test_summary(self): + self.rng.summary() + self.rng[2:2].summary() + try: + import pytz + bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc).summary() + except Exception: + pass + + def test_misc(self): + end = datetime(2009, 5, 13) + dr = bdate_range(end=end, periods=20) + firstDate = end - 19 * datetools.bday + + assert len(dr) == 20 + assert dr[0] == firstDate + assert dr[-1] == end + + def test_date_parse_failure(self): + badly_formed_date = '2007/100/1' + + self.assertRaises(ValueError, Timestamp, badly_formed_date) + + self.assertRaises(ValueError, bdate_range, start=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, end=badly_formed_date, + periods=10) + self.assertRaises(ValueError, bdate_range, badly_formed_date, + badly_formed_date) + + def test_equals(self): + self.assertFalse(self.rng.equals(list(self.rng))) + + def test_daterange_bug_456(self): + # GH #456 + rng1 = bdate_range('12/5/2011', '12/5/2011') + rng2 = bdate_range('12/2/2011', '12/5/2011') + rng2.offset = datetools.BDay() + + result = rng1.union(rng2) + self.assert_(isinstance(result, DatetimeIndex)) + + def test_error_with_zero_monthends(self): + self.assertRaises(ValueError, date_range, '1/1/2000', '1/1/2001', + freq=datetools.MonthEnd(0)) + + def test_range_bug(self): + # GH #770 + offset = datetools.DateOffset(months=3) + result = date_range("2011-1-1", "2012-1-31", freq=offset) + + start = datetime(2011, 1, 1) + exp_values = [start + i * offset for i in range(5)] + self.assert_(np.array_equal(result, DatetimeIndex(exp_values))) + + + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_frequencies.py b/pandas/tseries/tests/test_frequencies.py new file mode 100644 index 00000000..622bc585 --- /dev/null +++ b/pandas/tseries/tests/test_frequencies.py @@ -0,0 +1,209 @@ +from datetime import datetime, time, timedelta +import sys +import os +import unittest + +import nose + +import numpy as np + +from pandas import Index, DatetimeIndex, date_range + +from pandas.tseries.frequencies import to_offset, infer_freq +from pandas.tseries.tools import to_datetime +import pandas.tseries.frequencies as fmod +import pandas.tseries.offsets as offsets + +import pandas.lib as lib + +def test_to_offset_multiple(): + freqstr = '2h30min' + freqstr2 = '2h 30min' + + result = to_offset(freqstr) + assert(result == to_offset(freqstr2)) + expected = offsets.Minute(150) + assert(result == expected) + + freqstr = '2h30min15s' + result = to_offset(freqstr) + expected = offsets.Second(150 * 60 + 15) + assert(result == expected) + + freqstr = '2h 60min' + result = to_offset(freqstr) + expected = offsets.Hour(3) + assert(result == expected) + + freqstr = '15l500u' + result = to_offset(freqstr) + expected = offsets.Micro(15500) + assert(result == expected) + + freqstr = '10s75L' + result = to_offset(freqstr) + expected = offsets.Milli(10075) + assert(result == expected) + + # malformed + try: + to_offset('2h20m') + except ValueError: + pass + else: + assert(False) + +def test_to_offset_negative(): + freqstr = '-1S' + result = to_offset(freqstr) + assert(result.n == -1) + + freqstr='-5min10s' + result = to_offset(freqstr) + assert(result.n == -310) + + +def test_anchored_shortcuts(): + result = to_offset('W') + 
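    # Bare anchored aliases fall back to a default anchor: 'W' resolves to
    # week-ending-Sunday ('W-SUN') and 'Q' to quarter-ending-December
    # ('Q-DEC'), so the shortcut and the explicit spelling compare equal.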
expected = to_offset('W-SUN') + assert(result == expected) + + result = to_offset('Q') + expected = to_offset('Q-DEC') + assert(result == expected) + + +_dti = DatetimeIndex + +class TestFrequencyInference(unittest.TestCase): + + def test_raise_if_too_few(self): + index = _dti(['12/31/1998', '1/3/1999']) + self.assertRaises(ValueError, infer_freq, index) + + def test_business_daily(self): + index = _dti(['12/31/1998', '1/3/1999', '1/4/1999']) + self.assert_(infer_freq(index) == 'B') + + def test_day(self): + self._check_tick(timedelta(1), 'D') + + def test_day_corner(self): + index = _dti(['1/1/2000', '1/2/2000', '1/3/2000']) + self.assert_(infer_freq(index) == 'D') + + def test_non_datetimeindex(self): + dates = to_datetime(['1/1/2000', '1/2/2000', '1/3/2000']) + self.assert_(infer_freq(dates) == 'D') + + def test_hour(self): + self._check_tick(timedelta(hours=1), 'H') + + def test_minute(self): + self._check_tick(timedelta(minutes=1), 'T') + + def test_second(self): + self._check_tick(timedelta(seconds=1), 'S') + + def test_millisecond(self): + self._check_tick(timedelta(microseconds=1000), 'L') + + def test_microsecond(self): + self._check_tick(timedelta(microseconds=1), 'U') + + def test_nanosecond(self): + idx = DatetimeIndex(np.arange(0, 100, 10)) + inferred = idx.inferred_freq + + self.assert_(inferred == '10N') + + def _check_tick(self, base_delta, code): + b = datetime.now() + for i in range(1, 5): + inc = base_delta * i + index = _dti([b + inc * j for j in range(3)]) + if i > 1: + exp_freq = '%d%s' % (i, code) + else: + exp_freq = code + self.assert_(infer_freq(index) == exp_freq) + + index = _dti([b + base_delta * 7] + + [b + base_delta * j for j in range(3)]) + self.assert_(infer_freq(index) is None) + + index = _dti([b + base_delta * j for j in range(3)] + + [b + base_delta * 7]) + self.assert_(infer_freq(index) is None) + + def test_weekly(self): + days = ['MON', 'TUE', 'WED', 'THU', 'FRI'] + + for day in days: + self._check_generated_range('1/1/2000', 'W-%s' % day) + + def test_monthly(self): + self._check_generated_range('1/1/2000', 'M') + + def test_monthly_ambiguous(self): + rng = _dti(['1/31/2000', '2/29/2000', '3/31/2000']) + self.assert_(rng.inferred_freq == 'M') + + def test_business_monthly(self): + self._check_generated_range('1/1/2000', 'BM') + + def test_business_start_monthly(self): + self._check_generated_range('1/1/2000', 'BMS') + + def test_quarterly(self): + for month in ['JAN', 'FEB', 'MAR']: + self._check_generated_range('1/1/2000', 'Q-%s' % month) + + def test_annual(self): + for month in MONTHS: + self._check_generated_range('1/1/2000', 'A-%s' % month) + + def test_business_annual(self): + for month in MONTHS: + self._check_generated_range('1/1/2000', 'BA-%s' % month) + + def test_annual_ambiguous(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + self.assert_(rng.inferred_freq == 'A-JAN') + + def _check_generated_range(self, start, freq): + freq = freq.upper() + + gen = date_range(start, periods=7, freq=freq) + index = _dti(gen.values) + self.assert_(infer_freq(index) == gen.freqstr) + + gen = date_range(start, periods=5, freq=freq) + index = _dti(gen.values) + self.assert_(infer_freq(index) == gen.freqstr) + + def test_not_monotonic(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + rng = rng[::-1] + self.assert_(rng.inferred_freq is None) + + def test_non_datetimeindex(self): + rng = _dti(['1/31/2000', '1/31/2001', '1/31/2002']) + + vals = rng.to_pydatetime() + + result = infer_freq(vals) + self.assertEqual(result, 
rng.inferred_freq) + +MONTHS = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', + 'OCT', 'NOV', 'DEC'] + +def test_is_superperiod_subperiod(): + assert(fmod.is_superperiod(offsets.YearEnd(), offsets.MonthEnd())) + assert(fmod.is_subperiod(offsets.MonthEnd(), offsets.YearEnd())) + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py new file mode 100644 index 00000000..1311b9ca --- /dev/null +++ b/pandas/tseries/tests/test_offsets.py @@ -0,0 +1,1403 @@ +from datetime import datetime, timedelta +import unittest +import numpy as np + +from pandas.core.datetools import ( + bday, BDay, BQuarterEnd, BMonthEnd, BYearEnd, MonthEnd, MonthBegin, + BYearBegin, QuarterBegin, BQuarterBegin, BMonthBegin, + DateOffset, Week, YearBegin, YearEnd, Hour, Minute, Second, + WeekOfMonth, format, ole2datetime, QuarterEnd, to_datetime, normalize_date, + get_offset, get_offset_name, inferTimeRule, hasOffsetName, + get_standard_freq) + +from pandas.tseries.frequencies import _offset_map +from pandas.tseries.index import _to_m8 +from pandas.tseries.tools import parse_time_string +import pandas.tseries.offsets as offsets + +from nose.tools import assert_raises + +import pandas.lib as lib +from pandas.lib import Timestamp + +def test_monthrange(): + import calendar + for y in range(2000,2013): + for m in range(1,13): + assert lib.monthrange(y,m) == calendar.monthrange(y,m) + +#### +## Misc function tests +#### +def test_format(): + actual = format(datetime(2008, 1, 15)) + assert actual == '20080115' + +def test_ole2datetime(): + actual = ole2datetime(60000) + assert actual == datetime(2064, 4, 8) + + assert_raises(Exception, ole2datetime, 60) + +def test_to_datetime1(): + actual = to_datetime(datetime(2008, 1, 15)) + assert actual == datetime(2008, 1, 15) + + actual = to_datetime('20080115') + assert actual == datetime(2008, 1, 15) + + # unparseable + s = 'Month 1, 1999' + assert to_datetime(s) == s + +def test_normalize_date(): + actual = normalize_date(datetime(2007, 10, 1, 1, 12, 5, 10)) + assert actual == datetime(2007, 10, 1) + +def test_to_m8(): + valb = datetime(2007, 10, 1) + valu = _to_m8(valb) + assert type(valu) == np.datetime64 + #assert valu == np.datetime64(datetime(2007,10,1)) + +#def test_datetime64_box(): +# valu = np.datetime64(datetime(2007,10,1)) +# valb = _dt_box(valu) +# assert type(valb) == datetime +# assert valb == datetime(2007,10,1) + +##### +### DateOffset Tests +##### + +class TestDateOffset(unittest.TestCase): + + def setUp(self): + self.d = Timestamp(datetime(2008, 1, 2)) + + def test_repr(self): + repr(DateOffset()) + repr(DateOffset(2)) + repr(2 * DateOffset()) + repr(2 * DateOffset(months=2)) + + def test_mul(self): + assert DateOffset(2) == 2 * DateOffset(1) + assert DateOffset(2) == DateOffset(1) * 2 + + def test_constructor(self): + + assert((self.d + DateOffset(months=2)) == datetime(2008, 3, 2)) + assert((self.d - DateOffset(months=2)) == datetime(2007, 11, 2)) + + assert((self.d + DateOffset(2)) == datetime(2008, 1, 4)) + + assert not DateOffset(2).isAnchored() + assert DateOffset(1).isAnchored() + + d = datetime(2008, 1, 31) + assert((d + DateOffset(months=1)) == datetime(2008, 2, 29)) + + def test_copy(self): + assert(DateOffset(months=2).copy() == DateOffset(months=2)) + + def test_eq(self): + offset1 = DateOffset(days=1) + offset2 = DateOffset(days=365) + + self.assert_(offset1 != offset2) + 
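        # Offsets built with different keyword values (days=1 vs. days=365)
        # must compare unequal, and the result has to be consistent for both
        # the != and == operators.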
self.assert_(not (offset1 == offset2)) + +class TestBusinessDay(unittest.TestCase): + + def setUp(self): + self.d = datetime(2008, 1, 1) + + self.offset = BDay() + self.offset2 = BDay(2) + + def test_repr(self): + assert repr(self.offset) == '<1 BusinessDay>' + assert repr(self.offset2) == '<2 BusinessDays>' + + expected = '<1 BusinessDay: offset=datetime.timedelta(1)>' + assert repr(self.offset + timedelta(1)) == expected + + def test_with_offset(self): + offset = self.offset + timedelta(hours=2) + + assert (self.d + offset) == datetime(2008, 1, 2, 2) + + def testEQ(self): + self.assertEqual(self.offset2, self.offset2) + + def test_mul(self): + pass + + def test_hash(self): + self.assertEqual(hash(self.offset2), hash(self.offset2)) + + def testCall(self): + self.assertEqual(self.offset2(self.d), datetime(2008, 1, 3)) + + def testRAdd(self): + self.assertEqual(self.d + self.offset2, self.offset2 + self.d) + + def testSub(self): + off = self.offset2 + self.assertRaises(Exception, off.__sub__, self.d) + self.assertEqual(2 * off - off, off) + + self.assertEqual(self.d - self.offset2, self.d + BDay(-2)) + + def testRSub(self): + self.assertEqual(self.d - self.offset2, (-self.offset2).apply(self.d)) + + def testMult1(self): + self.assertEqual(self.d + 10*self.offset, self.d + BDay(10)) + + def testMult2(self): + self.assertEqual(self.d + (-5*BDay(-10)), + self.d + BDay(50)) + + + def testRollback1(self): + self.assertEqual(BDay(10).rollback(self.d), self.d) + + def testRollback2(self): + self.assertEqual(BDay(10).rollback(datetime(2008, 1, 5)), datetime(2008, 1, 4)) + + def testRollforward1(self): + self.assertEqual(BDay(10).rollforward(self.d), self.d) + + def testRollforward2(self): + self.assertEqual(BDay(10).rollforward(datetime(2008, 1, 5)), datetime(2008, 1, 7)) + + def test_onOffset(self): + tests = [(BDay(), datetime(2008, 1, 1), True), + (BDay(), datetime(2008, 1, 5), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_apply(self): + tests = [] + + tests.append((bday, + {datetime(2008, 1, 1): datetime(2008, 1, 2), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 8)})) + + tests.append((2*bday, + {datetime(2008, 1, 1): datetime(2008, 1, 3), + datetime(2008, 1, 4): datetime(2008, 1, 8), + datetime(2008, 1, 5): datetime(2008, 1, 8), + datetime(2008, 1, 6): datetime(2008, 1, 8), + datetime(2008, 1, 7): datetime(2008, 1, 9)})) + + tests.append((-bday, + {datetime(2008, 1, 1): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 3), + datetime(2008, 1, 5): datetime(2008, 1, 4), + datetime(2008, 1, 6): datetime(2008, 1, 4), + datetime(2008, 1, 7): datetime(2008, 1, 4), + datetime(2008, 1, 8): datetime(2008, 1, 7)})) + + tests.append((-2*bday, + {datetime(2008, 1, 1): datetime(2007, 12, 28), + datetime(2008, 1, 4): datetime(2008, 1, 2), + datetime(2008, 1, 5): datetime(2008, 1, 3), + datetime(2008, 1, 6): datetime(2008, 1, 3), + datetime(2008, 1, 7): datetime(2008, 1, 3), + datetime(2008, 1, 8): datetime(2008, 1, 4), + datetime(2008, 1, 9): datetime(2008, 1, 7)})) + + tests.append((BDay(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 1, 4): datetime(2008, 1, 4), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + 
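                # Each mapping pairs an input date with the date expected
                # after applying the offset; assertEq applies the offset to
                # the base date and checks that the expected result comes back.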
assertEq(offset, base, expected) + + def test_apply_corner(self): + self.assertRaises(Exception, BDay().apply, BMonthEnd()) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BDay() + offset2 = BDay() + self.assertFalse(offset1 != offset2) + +def assertOnOffset(offset, date, expected): + actual = offset.onOffset(date) + assert actual == expected + +class TestWeek(unittest.TestCase): + def test_corner(self): + self.assertRaises(Exception, Week, weekday=7) + self.assertRaises(Exception, Week, weekday=-1) + + def test_isAnchored(self): + self.assert_(Week(weekday=0).isAnchored()) + self.assert_(not Week().isAnchored()) + self.assert_(not Week(2, weekday=2).isAnchored()) + self.assert_(not Week(2).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((Week(), # not business week + {datetime(2008, 1, 1): datetime(2008, 1, 8), + datetime(2008, 1, 4): datetime(2008, 1, 11), + datetime(2008, 1, 5): datetime(2008, 1, 12), + datetime(2008, 1, 6): datetime(2008, 1, 13), + datetime(2008, 1, 7): datetime(2008, 1, 14)})) + + tests.append((Week(weekday=0), # Mon + {datetime(2007, 12, 31): datetime(2008, 1, 7), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 14)})) + + tests.append((Week(0, weekday=0), # n=0 -> roll forward. Mon + {datetime(2007, 12, 31): datetime(2007, 12, 31), + datetime(2008, 1, 4): datetime(2008, 1, 7), + datetime(2008, 1, 5): datetime(2008, 1, 7), + datetime(2008, 1, 6): datetime(2008, 1, 7), + datetime(2008, 1, 7): datetime(2008, 1, 7)})) + + tests.append((Week(-2, weekday=1), # n=0 -> roll forward. Mon + {datetime(2010, 4, 6): datetime(2010, 3, 23), + datetime(2010, 4, 8): datetime(2010, 3, 30), + datetime(2010, 4, 5): datetime(2010, 3, 23)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_onOffset(self): + for weekday in range(7): + offset = Week(weekday=weekday) + + for day in range(1, 8): + date = datetime(2008, 1, day) + + if day % 7 == weekday: + expected = True + else: + expected = False + assertOnOffset(offset, date, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = Week() + offset2 = Week() + self.assertFalse(offset1 != offset2) + +class TestWeekOfMonth(unittest.TestCase): + + def test_constructor(self): + self.assertRaises(Exception, WeekOfMonth, n=0, week=1, weekday=1) + self.assertRaises(Exception, WeekOfMonth, n=1, week=4, weekday=0) + self.assertRaises(Exception, WeekOfMonth, n=1, week=-1, weekday=0) + self.assertRaises(Exception, WeekOfMonth, n=1, week=0, weekday=-1) + self.assertRaises(Exception, WeekOfMonth, n=1, week=0, weekday=7) + + def test_offset(self): + date1 = datetime(2011, 1, 4) # 1st Tuesday of Month + date2 = datetime(2011, 1, 11) # 2nd Tuesday of Month + date3 = datetime(2011, 1, 18) # 3rd Tuesday of Month + date4 = datetime(2011, 1, 25) # 4th Tuesday of Month + + # see for loop for structure + test_cases = [ + (-2, 2, 1, date1, datetime(2010, 11, 16)), + (-2, 2, 1, date2, datetime(2010, 11, 16)), + (-2, 2, 1, date3, datetime(2010, 11, 16)), + (-2, 2, 1, date4, datetime(2010, 12, 21)), + + (-1, 2, 1, date1, datetime(2010, 12, 21)), + (-1, 2, 1, date2, datetime(2010, 12, 21)), + (-1, 2, 1, date3, datetime(2010, 12, 21)), + (-1, 2, 1, date4, datetime(2011, 1, 18)), + + (1, 0, 0, date1, datetime(2011, 2, 7)), + (1, 0, 0, date2, datetime(2011, 2, 7)), + (1, 0, 0, date3, 
datetime(2011, 2, 7)), + (1, 0, 0, date4, datetime(2011, 2, 7)), + (1, 0, 1, date1, datetime(2011, 2, 1)), + (1, 0, 1, date2, datetime(2011, 2, 1)), + (1, 0, 1, date3, datetime(2011, 2, 1)), + (1, 0, 1, date4, datetime(2011, 2, 1)), + (1, 0, 2, date1, datetime(2011, 1, 5)), + (1, 0, 2, date2, datetime(2011, 2, 2)), + (1, 0, 2, date3, datetime(2011, 2, 2)), + (1, 0, 2, date4, datetime(2011, 2, 2)), + + (1, 2, 1, date1, datetime(2011, 1, 18)), + (1, 2, 1, date2, datetime(2011, 1, 18)), + (1, 2, 1, date3, datetime(2011, 2, 15)), + (1, 2, 1, date4, datetime(2011, 2, 15)), + + (2, 2, 1, date1, datetime(2011, 2, 15)), + (2, 2, 1, date2, datetime(2011, 2, 15)), + (2, 2, 1, date3, datetime(2011, 3, 15)), + (2, 2, 1, date4, datetime(2011, 3, 15)), + ] + + for n, week, weekday, date, expected in test_cases: + offset = WeekOfMonth(n, week=week, weekday=weekday) + assertEq(offset, date, expected) + + # try subtracting + result = datetime(2011, 2, 1) - WeekOfMonth(week=1, weekday=2) + self.assertEqual(result, datetime(2011, 1, 12)) + result = datetime(2011, 2, 3) - WeekOfMonth(week=0, weekday=2) + self.assertEqual(result, datetime(2011, 2, 2)) + + def test_onOffset(self): + test_cases = [ + (0, 0, datetime(2011, 2, 7), True), + (0, 0, datetime(2011, 2, 6), False), + (0, 0, datetime(2011, 2, 14), False), + (1, 0, datetime(2011, 2, 14), True), + (0, 1, datetime(2011, 2, 1), True), + (0, 1, datetime(2011, 2, 8), False), + ] + + for week, weekday, date, expected in test_cases: + offset = WeekOfMonth(week=week, weekday=weekday) + self.assert_(offset.onOffset(date) == expected) + +class TestBMonthBegin(unittest.TestCase): + def test_offset(self): + tests = [] + + tests.append((BMonthBegin(), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 1): datetime(2006, 10, 2), + datetime(2007, 1, 1): datetime(2007, 2, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1)})) + + tests.append((BMonthBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 10, 2): datetime(2006, 10, 2), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2006, 12, 29): datetime(2007, 1, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 9, 15): datetime(2006, 10, 2)})) + + tests.append((BMonthBegin(2), + {datetime(2008, 1, 1): datetime(2008, 3, 3), + datetime(2008, 1, 15): datetime(2008, 3, 3), + datetime(2006, 12, 29): datetime(2007, 2, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + tests.append((BMonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 6, 30): datetime(2008, 6, 2), + datetime(2008, 6, 1): datetime(2008, 5, 1), + datetime(2008, 3, 10): datetime(2008, 3, 3), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 12, 30): datetime(2006, 12, 1), + datetime(2007, 1, 1): datetime(2006, 12, 1)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [(BMonthBegin(), datetime(2007, 12, 31), False), + (BMonthBegin(), datetime(2008, 1, 1), True), + (BMonthBegin(), datetime(2001, 4, 2), True), + (BMonthBegin(), datetime(2008, 3, 3), True)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def 
test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthBegin() + offset2 = BMonthBegin() + self.assertFalse(offset1 != offset2) + + +class TestBMonthEnd(unittest.TestCase): + + def test_offset(self): + tests = [] + + tests.append((BMonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 29)})) + + tests.append((BMonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 29), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + tests.append((BMonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 2, 28), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 29)})) + + tests.append((BMonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2008, 5, 30), + datetime(2008, 12, 31): datetime(2008, 11, 28), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + BMonthEnd() + expected = dt.replace(hour=0) + BMonthEnd() + self.assertEqual(result, expected) + + def test_onOffset(self): + + tests = [(BMonthEnd(), datetime(2007, 12, 31), True), + (BMonthEnd(), datetime(2008, 1, 1), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + + def test_offsets_compare_equal(self): + # root cause of #456 + offset1 = BMonthEnd() + offset2 = BMonthEnd() + self.assertFalse(offset1 != offset2) + +class TestMonthBegin(unittest.TestCase): + + def test_offset(self): + tests = [] + + #NOTE: I'm not entirely happy with the logic here for Begin -ss + #see thread 'offset conventions' on the ML + tests.append((MonthBegin(), + {datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 2, 1): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 1, 1), + datetime(2006, 12, 1): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + tests.append((MonthBegin(0), + {datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2006, 12, 3): datetime(2007, 1, 1), + datetime(2007, 1, 31): datetime(2007, 2, 1)})) + + tests.append((MonthBegin(2), + {datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 3, 1), + datetime(2006, 12, 31): datetime(2007, 2, 1), + datetime(2007, 12, 28): datetime(2008, 2, 1), + datetime(2007, 1, 1): datetime(2007, 3, 1), + datetime(2006, 11, 1): datetime(2007, 1, 1)})) + + tests.append((MonthBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 1), + datetime(2008, 5, 31): datetime(2008, 5, 1), + datetime(2008, 12, 31): datetime(2008, 12, 1), + datetime(2006, 12, 29): datetime(2006, 12, 1), + datetime(2006, 1, 2): datetime(2006, 1, 1)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + +class 
TestMonthEnd(unittest.TestCase): + + def test_offset(self): + tests = [] + + tests.append((MonthEnd(), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2007, 1, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31), + datetime(2006, 12, 1): datetime(2006, 12, 31)})) + + tests.append((MonthEnd(0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2006, 12, 29): datetime(2006, 12, 31), + datetime(2006, 12, 31): datetime(2006, 12, 31), + datetime(2007, 1, 1): datetime(2007, 1, 31)})) + + tests.append((MonthEnd(2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 3, 31), + datetime(2006, 12, 29): datetime(2007, 1, 31), + datetime(2006, 12, 31): datetime(2007, 2, 28), + datetime(2007, 1, 1): datetime(2007, 2, 28), + datetime(2006, 11, 1): datetime(2006, 12, 31)})) + + tests.append((MonthEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2008, 5, 31), + datetime(2008, 12, 31): datetime(2008, 11, 30), + datetime(2006, 12, 29): datetime(2006, 11, 30), + datetime(2006, 12, 30): datetime(2006, 11, 30), + datetime(2007, 1, 1): datetime(2006, 12, 31)})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_normalize(self): + dt = datetime(2007, 1, 1, 3) + + result = dt + MonthEnd() + expected = dt.replace(hour=0) + MonthEnd() + self.assertEqual(result, expected) + + def test_onOffset(self): + + tests = [(MonthEnd(), datetime(2007, 12, 31), True), + (MonthEnd(), datetime(2008, 1, 1), False)] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestBQuarterBegin(unittest.TestCase): + + def test_isAnchored(self): + self.assert_(BQuarterBegin(startingMonth=1).isAnchored()) + self.assert_(BQuarterBegin().isAnchored()) + self.assert_(not BQuarterBegin(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((BQuarterBegin(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 1, 31): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2007, 3, 15): datetime(2007, 4, 2), + datetime(2007, 2, 28): datetime(2007, 4, 2), + datetime(2007, 1, 1): datetime(2007, 4, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 7, 2), + datetime(2008, 4, 30): datetime(2008, 7, 1),})) + + tests.append((BQuarterBegin(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 8, 15): datetime(2008, 11, 3), + datetime(2008, 9, 15): datetime(2008, 11, 3), + datetime(2008, 11, 1): datetime(2008, 11, 3), + datetime(2008, 4, 30): datetime(2008, 5, 1),})) + + tests.append((BQuarterBegin(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2007, 
12, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 1, 15): datetime(2008, 4, 1), + datetime(2008, 2, 27): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2007, 4, 1): datetime(2007, 4, 2), + datetime(2007, 4, 2): datetime(2007, 4, 2), + datetime(2007, 7, 1): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 7, 2), + datetime(2007, 7, 2): datetime(2007, 7, 2),})) + + tests.append((BQuarterBegin(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2007, 7, 3): datetime(2007, 7, 2), + datetime(2007, 4, 3): datetime(2007, 4, 2), + datetime(2007, 7, 2): datetime(2007, 4, 2), + datetime(2008, 4, 1): datetime(2008, 1, 1),})) + + tests.append((BQuarterBegin(startingMonth=1, n=2), + {datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 1, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2007, 3, 31): datetime(2007, 7, 2), + datetime(2007, 4, 15): datetime(2007, 10, 1), + datetime(2008, 4, 30): datetime(2008, 10, 1),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + # corner + offset = BQuarterBegin(n=-1, startingMonth=1) + self.assertEqual(datetime(2007, 4, 3) + offset, datetime(2007, 4, 2)) + +class TestBQuarterEnd(unittest.TestCase): + + def test_isAnchored(self): + self.assert_(BQuarterEnd(startingMonth=1).isAnchored()) + self.assert_(BQuarterEnd().isAnchored()) + self.assert_(not BQuarterEnd(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((BQuarterEnd(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31),})) + + tests.append((BQuarterEnd(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 30), + datetime(2008, 3, 15): datetime(2008, 5, 30), + datetime(2008, 3, 31): datetime(2008, 5, 30), + datetime(2008, 4, 15): datetime(2008, 5, 30), + datetime(2008, 4, 30): datetime(2008, 5, 30),})) + + tests.append((BQuarterEnd(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30),})) + + tests.append((BQuarterEnd(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + 
datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31),})) + + tests.append((BQuarterEnd(startingMonth=1, n=2), + {datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + # corner + offset = BQuarterEnd(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 1, 31) + offset, datetime(2010, 1, 29)) + + def test_onOffset(self): + + tests = [(BQuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (BQuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + + (BQuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), True), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (BQuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + + (BQuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), True), + (BQuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestQuarterBegin(unittest.TestCase): + def test_isAnchored(self): + self.assert_(QuarterBegin(startingMonth=1).isAnchored()) + self.assert_(QuarterBegin().isAnchored()) + self.assert_(not QuarterBegin(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((QuarterBegin(startingMonth=1), + {datetime(2007, 12, 1): datetime(2008, 1, 1), + datetime(2008, 1, 1): datetime(2008, 4, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 7, 1), + datetime(2008, 4, 1): datetime(2008, 7, 1),})) + + tests.append((QuarterBegin(startingMonth=2), + {datetime(2008, 1, 1): 
datetime(2008, 2, 1), + datetime(2008, 1, 31): datetime(2008, 2, 1), + datetime(2008, 1, 15): datetime(2008, 2, 1), + datetime(2008, 2, 29): datetime(2008, 5, 1), + datetime(2008, 3, 15): datetime(2008, 5, 1), + datetime(2008, 3, 31): datetime(2008, 5, 1), + datetime(2008, 4, 15): datetime(2008, 5, 1), + datetime(2008, 4, 30): datetime(2008, 5, 1),})) + + tests.append((QuarterBegin(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 12, 1): datetime(2009, 1, 1), + datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 4, 1), + datetime(2008, 2, 29): datetime(2008, 4, 1), + datetime(2008, 3, 15): datetime(2008, 4, 1), + datetime(2008, 3, 31): datetime(2008, 4, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1),})) + + tests.append((QuarterBegin(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 1), + datetime(2008, 1, 31): datetime(2008, 1, 1), + datetime(2008, 2, 15): datetime(2008, 1, 1), + datetime(2008, 2, 29): datetime(2008, 1, 1), + datetime(2008, 3, 15): datetime(2008, 1, 1), + datetime(2008, 3, 31): datetime(2008, 1, 1), + datetime(2008, 4, 15): datetime(2008, 4, 1), + datetime(2008, 4, 30): datetime(2008, 4, 1), + datetime(2008, 7, 1): datetime(2008, 4, 1)})) + + tests.append((QuarterBegin(startingMonth=1, n=2), + {datetime(2008, 1, 1): datetime(2008, 7, 1), + datetime(2008, 2, 15): datetime(2008, 7, 1), + datetime(2008, 2, 29): datetime(2008, 7, 1), + datetime(2008, 3, 15): datetime(2008, 7, 1), + datetime(2008, 3, 31): datetime(2008, 7, 1), + datetime(2008, 4, 15): datetime(2008, 10, 1), + datetime(2008, 4, 1): datetime(2008, 10, 1),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + # corner + offset = QuarterBegin(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 2, 1) + offset, datetime(2010, 1, 1)) + +class TestQuarterEnd(unittest.TestCase): + + def test_isAnchored(self): + self.assert_(QuarterEnd(startingMonth=1).isAnchored()) + self.assert_(QuarterEnd().isAnchored()) + self.assert_(not QuarterEnd(2, startingMonth=1).isAnchored()) + + def test_offset(self): + tests = [] + + tests.append((QuarterEnd(startingMonth=1), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 4, 30), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 7, 31),})) + + tests.append((QuarterEnd(startingMonth=2), + {datetime(2008, 1, 1): datetime(2008, 2, 29), + datetime(2008, 1, 31): datetime(2008, 2, 29), + datetime(2008, 2, 15): datetime(2008, 2, 29), + datetime(2008, 2, 29): datetime(2008, 5, 31), + datetime(2008, 3, 15): datetime(2008, 5, 31), + datetime(2008, 3, 31): datetime(2008, 5, 31), + datetime(2008, 4, 15): datetime(2008, 5, 31), + datetime(2008, 4, 30): datetime(2008, 5, 31),})) + + tests.append((QuarterEnd(startingMonth=1, n=0), + {datetime(2008, 1, 1): datetime(2008, 1, 31), + datetime(2008, 1, 31): datetime(2008, 1, 31), + datetime(2008, 2, 15): datetime(2008, 4, 30), + datetime(2008, 2, 29): datetime(2008, 4, 30), + datetime(2008, 3, 15): datetime(2008, 4, 30), + datetime(2008, 3, 31): datetime(2008, 4, 30), + datetime(2008, 4, 15): datetime(2008, 4, 30), + datetime(2008, 4, 30): datetime(2008, 4, 30),})) + + 
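+ # n=-1: expect a step back to the previous quarter end, even from a date already on a quarter end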
tests.append((QuarterEnd(startingMonth=1, n=-1), + {datetime(2008, 1, 1): datetime(2007, 10, 31), + datetime(2008, 1, 31): datetime(2007, 10, 31), + datetime(2008, 2, 15): datetime(2008, 1, 31), + datetime(2008, 2, 29): datetime(2008, 1, 31), + datetime(2008, 3, 15): datetime(2008, 1, 31), + datetime(2008, 3, 31): datetime(2008, 1, 31), + datetime(2008, 4, 15): datetime(2008, 1, 31), + datetime(2008, 4, 30): datetime(2008, 1, 31), + datetime(2008, 7, 1): datetime(2008, 4, 30)})) + + tests.append((QuarterEnd(startingMonth=1, n=2), + {datetime(2008, 1, 31): datetime(2008, 7, 31), + datetime(2008, 2, 15): datetime(2008, 7, 31), + datetime(2008, 2, 29): datetime(2008, 7, 31), + datetime(2008, 3, 15): datetime(2008, 7, 31), + datetime(2008, 3, 31): datetime(2008, 7, 31), + datetime(2008, 4, 15): datetime(2008, 7, 31), + datetime(2008, 4, 30): datetime(2008, 10, 31),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + # corner + offset = QuarterEnd(n=-1, startingMonth=1) + self.assertEqual(datetime(2010, 2, 1) + offset, datetime(2010, 1, 31)) + + def test_onOffset(self): + + tests = [(QuarterEnd(1, startingMonth=1), datetime(2008, 1, 31), True), + (QuarterEnd(1, startingMonth=1), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 4, 30), True), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=1), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=1), datetime(2007, 6, 30), False), + + (QuarterEnd(1, startingMonth=2), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 12, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 2, 29), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 3, 31), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=2), datetime(2008, 5, 31), True), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=2), datetime(2007, 6, 30), False), + + (QuarterEnd(1, startingMonth=3), datetime(2008, 1, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 12, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 2, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 3, 31), True), + (QuarterEnd(1, startingMonth=3), datetime(2008, 4, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 30), False), + (QuarterEnd(1, startingMonth=3), datetime(2008, 5, 31), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 29), False), + (QuarterEnd(1, startingMonth=3), datetime(2007, 6, 30), True), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestBYearBegin(unittest.TestCase): + + def test_misspecified(self): + self.assertRaises(ValueError, BYearBegin, month=13) + self.assertRaises(ValueError, BYearEnd, month=13) + + def test_offset(self): + tests = [] + + tests.append((BYearBegin(), + {datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): 
datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2011, 1, 1) : datetime(2011, 1, 3), + datetime(2011, 1, 3) : datetime(2012, 1, 2), + datetime(2005, 12, 30) : datetime(2006, 1, 2), + datetime(2005, 12, 31) : datetime(2006, 1, 2) + } + )) + + tests.append((BYearBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 2), + datetime(2005, 12, 31): datetime(2006, 1, 2),})) + + tests.append((BYearBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 1, 2), + datetime(2009, 1, 4): datetime(2009, 1, 1), + datetime(2009, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 2), + datetime(2006, 12, 30): datetime(2006, 1, 2), + datetime(2006, 1, 1): datetime(2005, 1, 3),})) + + tests.append((BYearBegin(-2), + {datetime(2007, 1, 1): datetime(2005, 1, 3), + datetime(2007, 6, 30): datetime(2006, 1, 2), + datetime(2008, 12, 31): datetime(2007, 1, 1),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + +class TestYearBegin(unittest.TestCase): + + def test_misspecified(self): + self.assertRaises(ValueError, YearBegin, month=13) + + def test_offset(self): + tests = [] + + tests.append((YearBegin(), + {datetime(2008, 1, 1): datetime(2009, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1),})) + + tests.append((YearBegin(0), + {datetime(2008, 1, 1): datetime(2008, 1, 1), + datetime(2008, 6, 30): datetime(2009, 1, 1), + datetime(2008, 12, 31): datetime(2009, 1, 1), + datetime(2005, 12, 30): datetime(2006, 1, 1), + datetime(2005, 12, 31): datetime(2006, 1, 1),})) + + + tests.append((YearBegin(-1), + {datetime(2007, 1, 1): datetime(2006, 1, 1), + datetime(2008, 6, 30): datetime(2008, 1, 1), + datetime(2008, 12, 31): datetime(2008, 1, 1), + datetime(2006, 12, 29): datetime(2006, 1, 1), + datetime(2006, 12, 30): datetime(2006, 1, 1), + datetime(2007, 1, 1): datetime(2006, 1, 1),})) + + tests.append((YearBegin(-2), + {datetime(2007, 1, 1): datetime(2005, 1, 1), + datetime(2008, 6, 30): datetime(2007, 1, 1), + datetime(2008, 12, 31): datetime(2007, 1, 1),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + + def test_onOffset(self): + + tests = [ + (YearBegin(), datetime(2007, 1, 3), False), + (YearBegin(), datetime(2008, 1, 1), True), + (YearBegin(), datetime(2006, 12, 31), False), + (YearBegin(), datetime(2006, 1, 2), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestBYearEndLagged(unittest.TestCase): + + def test_bad_month_fail(self): + self.assertRaises(Exception, BYearEnd, month=13) + self.assertRaises(Exception, BYearEnd, month=0) + + def test_offset(self): + tests = [] + + tests.append((BYearEnd(month=6), + {datetime(2008, 1, 1): datetime(2008, 6, 30), + datetime(2007, 6, 30): datetime(2008, 6, 30)}, + )) + + tests.append((BYearEnd(n=-1, month=6), + {datetime(2008, 1, 1): datetime(2007, 6, 29), + datetime(2007, 6, 30): datetime(2007, 6, 29)}, + )) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + self.assertEqual(base + offset, expected) + + def test_roll(self): + offset = 
BYearEnd(month=6) + date = datetime(2009, 11, 30) + + self.assertEqual(offset.rollforward(date), datetime(2010, 6, 30)) + self.assertEqual(offset.rollback(date), datetime(2009, 6, 30)) + + def test_onOffset(self): + + tests = [ + (BYearEnd(month=2), datetime(2007, 2, 28), True), + (BYearEnd(month=6), datetime(2007, 6, 30), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestBYearEnd(unittest.TestCase): + + def test_offset(self): + tests = [] + + tests.append((BYearEnd(), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2006, 12, 29), + datetime(2005, 12, 31): datetime(2006, 12, 29),})) + + tests.append((BYearEnd(0), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 29),})) + + tests.append((BYearEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 29), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 30), + datetime(2006, 12, 30): datetime(2006, 12, 29), + datetime(2007, 1, 1): datetime(2006, 12, 29),})) + + tests.append((BYearEnd(-2), + {datetime(2007, 1, 1): datetime(2005, 12, 30), + datetime(2008, 6, 30): datetime(2006, 12, 29), + datetime(2008, 12, 31): datetime(2006, 12, 29),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (BYearEnd(), datetime(2007, 12, 31), True), + (BYearEnd(), datetime(2008, 1, 1), False), + (BYearEnd(), datetime(2006, 12, 31), False), + (BYearEnd(), datetime(2006, 12, 29), True), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +class TestYearEnd(unittest.TestCase): + + def test_misspecified(self): + self.assertRaises(ValueError, YearEnd, month=13) + + def test_offset(self): + tests = [] + + tests.append((YearEnd(), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2009, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31), + datetime(2005, 12, 31): datetime(2006, 12, 31),})) + + tests.append((YearEnd(0), + {datetime(2008, 1, 1): datetime(2008, 12, 31), + datetime(2008, 6, 30): datetime(2008, 12, 31), + datetime(2008, 12, 31): datetime(2008, 12, 31), + datetime(2005, 12, 30): datetime(2005, 12, 31),})) + + tests.append((YearEnd(-1), + {datetime(2007, 1, 1): datetime(2006, 12, 31), + datetime(2008, 6, 30): datetime(2007, 12, 31), + datetime(2008, 12, 31): datetime(2007, 12, 31), + datetime(2006, 12, 29): datetime(2005, 12, 31), + datetime(2006, 12, 30): datetime(2005, 12, 31), + datetime(2007, 1, 1): datetime(2006, 12, 31),})) + + tests.append((YearEnd(-2), + {datetime(2007, 1, 1): datetime(2005, 12, 31), + datetime(2008, 6, 30): datetime(2006, 12, 31), + datetime(2008, 12, 31): datetime(2006, 12, 31),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (YearEnd(), datetime(2007, 12, 31), True), + (YearEnd(), datetime(2008, 1, 1), False), + (YearEnd(), datetime(2006, 12, 31), True), + (YearEnd(), datetime(2006, 12, 29), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, 
expected) + +class TestYearEndDiffMonth(unittest.TestCase): + + def test_offset(self): + tests = [] + + tests.append((YearEnd(month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 15): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2009, 3, 31), + datetime(2008, 3, 30): datetime(2008, 3, 31), + datetime(2005, 3, 31): datetime(2006, 3, 31), + datetime(2006, 7, 30): datetime(2007, 3, 31)})) + + tests.append((YearEnd(0, month=3), + {datetime(2008, 1, 1): datetime(2008, 3, 31), + datetime(2008, 2, 28): datetime(2008, 3, 31), + datetime(2008, 3, 31): datetime(2008, 3, 31), + datetime(2005, 3, 30): datetime(2005, 3, 31),})) + + tests.append((YearEnd(-1, month=3), + {datetime(2007, 1, 1): datetime(2006, 3, 31), + datetime(2008, 2, 28): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2007, 3, 31), + datetime(2006, 3, 29): datetime(2005, 3, 31), + datetime(2006, 3, 30): datetime(2005, 3, 31), + datetime(2007, 3, 1): datetime(2006, 3, 31),})) + + tests.append((YearEnd(-2, month=3), + {datetime(2007, 1, 1): datetime(2005, 3, 31), + datetime(2008, 6, 30): datetime(2007, 3, 31), + datetime(2008, 3, 31): datetime(2006, 3, 31),})) + + for offset, cases in tests: + for base, expected in cases.iteritems(): + assertEq(offset, base, expected) + + def test_onOffset(self): + + tests = [ + (YearEnd(month=3), datetime(2007, 3, 31), True), + (YearEnd(month=3), datetime(2008, 1, 1), False), + (YearEnd(month=3), datetime(2006, 3, 31), True), + (YearEnd(month=3), datetime(2006, 3, 29), False), + ] + + for offset, date, expected in tests: + assertOnOffset(offset, date, expected) + +def assertEq(offset, base, expected): + actual = offset + base + try: + assert actual == expected + except AssertionError: + raise AssertionError("\nExpected: %s\nActual: %s\nFor Offset: %s)" + "\nAt Date: %s"% + (expected, actual, offset, base)) + +def test_Hour(): + assertEq(Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 1)) + assertEq(Hour(-1), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + assertEq(2 * Hour(), datetime(2010, 1, 1), datetime(2010, 1, 1, 2)) + assertEq(-1 * Hour(), datetime(2010, 1, 1, 1), datetime(2010, 1, 1)) + + assert (Hour(3) + Hour(2)) == Hour(5) + assert (Hour(3) - Hour(2)) == Hour() + + assert(Hour(4) != Hour(1)) + +def test_Minute(): + assertEq(Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 1)) + assertEq(Minute(-1), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + assertEq(2 * Minute(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 2)) + assertEq(-1 * Minute(), datetime(2010, 1, 1, 0, 1), datetime(2010, 1, 1)) + + assert (Minute(3) + Minute(2)) == Minute(5) + assert (Minute(3) - Minute(2)) == Minute() + assert(Minute(5) != Minute()) + +def test_Second(): + assertEq(Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 1)) + assertEq(Second(-1), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + assertEq(2 * Second(), datetime(2010, 1, 1), datetime(2010, 1, 1, 0, 0, 2)) + assertEq(-1 * Second(), datetime(2010, 1, 1, 0, 0, 1), datetime(2010, 1, 1)) + + assert (Second(3) + Second(2)) == Second(5) + assert (Second(3) - Second(2)) == Second() + +def test_hasOffsetName(): + assert hasOffsetName(BDay()) + assert not hasOffsetName(BDay(2)) + +def test_get_offset_name(): + assert_raises(Exception, get_offset_name, BDay(2)) + + assert get_offset_name(BDay()) == 'B' + assert get_offset_name(BMonthEnd()) == 'BM' + assert get_offset_name(Week(weekday=0)) == 'W-MON' + assert get_offset_name(Week(weekday=1)) =='W-TUE' + assert 
get_offset_name(Week(weekday=2)) == 'W-WED' + assert get_offset_name(Week(weekday=3)) == 'W-THU' + assert get_offset_name(Week(weekday=4)) == 'W-FRI' + + +def test_get_offset(): + assert_raises(Exception, get_offset, 'gibberish') + + assert get_offset('B') == BDay() + assert get_offset('b') == BDay() + assert get_offset('bm') == BMonthEnd() + assert get_offset('Bm') == BMonthEnd() + assert get_offset('W-MON') == Week(weekday=0) + assert get_offset('W-TUE') == Week(weekday=1) + assert get_offset('W-WED') == Week(weekday=2) + assert get_offset('W-THU') == Week(weekday=3) + assert get_offset('W-FRI') == Week(weekday=4) + assert get_offset('w@Sat') == Week(weekday=5) + +def test_parse_time_string(): + (date, parsed, reso) = parse_time_string('4Q1984') + (date_lower, parsed_lower, reso_lower) = parse_time_string('4q1984') + assert date == date_lower + assert parsed == parsed_lower + assert reso == reso_lower + +def test_get_standard_freq(): + fstr = get_standard_freq('W') + assert fstr == get_standard_freq('w') + assert fstr == get_standard_freq('1w') + assert fstr == get_standard_freq(('W', 1)) + assert fstr == get_standard_freq('WeEk') + + fstr = get_standard_freq('5Q') + assert fstr == get_standard_freq('5q') + assert fstr == get_standard_freq('5QuarTer') + assert fstr == get_standard_freq(('q', 5)) + +def test_quarterly_dont_normalize(): + date = datetime(2012, 3, 31, 5, 30) + + offsets = (QuarterBegin, QuarterEnd, BQuarterEnd, BQuarterBegin) + + for klass in offsets: + result = date + klass() + assert(result.time() == date.time()) + + +class TestOffsetAliases(unittest.TestCase): + + def setUp(self): + pass + + def test_alias_equality(self): + from pandas.tseries.frequencies import _offset_map + + for k, v in _offset_map.iteritems(): + if v is None: + continue + self.assertEqual(k, v.copy()) + + def test_rule_code(self): + lst = ['M', 'MS', 'BM', 'BMS', 'D', 'B', 'H', 'T', 'S', 'L', 'U'] + for k in lst: + assert k == _offset_map[k].rule_code + assert k == (_offset_map[k] * 3).rule_code + + suffix_lst = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] + base = 'W' + for v in suffix_lst: + alias = '-'.join([base, v]) + assert alias == _offset_map[alias].rule_code + assert alias == (_offset_map[alias] * 5).rule_code + + suffix_lst = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', + 'SEP', 'OCT', 'NOV', 'DEC'] + base_lst = ['A', 'AS', 'BA', 'BAS', 'Q', 'QS', 'BQ', 'BQS'] + for base in base_lst: + for v in suffix_lst: + alias = '-'.join([base, v]) + assert alias == _offset_map[alias].rule_code + assert alias == (_offset_map[alias] * 5).rule_code + +def test_apply_ticks(): + result = offsets.Hour(3).apply(offsets.Hour(4)) + exp = offsets.Hour(7) + assert(result == exp) + +def test_delta_to_tick(): + delta = timedelta(3) + + tick = offsets._delta_to_tick(delta) + assert(tick == offsets.Day(3)) + +def test_dateoffset_misc(): + oset = offsets.DateOffset(months=2, days=4) + # it works + result = oset.freqstr + + assert(not offsets.DateOffset(months=2) == 2) + +def test_freq_offsets(): + off = BDay(1, offset=timedelta(0, 1800)) + assert(off.freqstr == 'B+30Min') + + off = BDay(1, offset=timedelta(0, -1800)) + assert(off.freqstr == 'B-30Min') + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py new file mode 100644 index 00000000..fb09fec8 --- /dev/null +++ b/pandas/tseries/tests/test_period.py @@ -0,0 +1,1821 @@ +"""Tests suite 
for Period handling. + +Parts derived from scikits.timeseries code, original authors: +- Pierre Gerard-Marchant & Matt Knox +- pierregm_at_uga_dot_edu - mattknow_ca_at_hotmail_dot_com + +""" + +from unittest import TestCase +from datetime import datetime, date, timedelta +import unittest + +from numpy.ma.testutils import assert_equal + +from pandas.tseries.frequencies import MONTHS, DAYS +from pandas.tseries.period import Period, PeriodIndex, period_range +from pandas.tseries.index import DatetimeIndex, date_range +from pandas.tseries.tools import to_datetime +import pandas.tseries.period as pmod + +import pandas.core.datetools as datetools +import numpy as np +randn = np.random.randn + +from pandas import Series, TimeSeries, DataFrame +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +class TestPeriodProperties(TestCase): + "Test properties such as year, month, weekday, etc...." + # + def __init__(self, *args, **kwds): + TestCase.__init__(self, *args, **kwds) + + def test_quarterly_negative_ordinals(self): + p = Period(ordinal=-1, freq='Q-DEC') + self.assertEquals(p.year, 1969) + self.assertEquals(p.quarter, 4) + + p = Period(ordinal=-2, freq='Q-DEC') + self.assertEquals(p.year, 1969) + self.assertEquals(p.quarter, 3) + + def test_period_cons_quarterly(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'Q-%s' % month + exp = Period('1989Q3', freq=freq) + self.assert_('1989Q3' in str(exp)) + stamp = exp.to_timestamp('D', how='end') + p = Period(stamp, freq=freq) + self.assertEquals(p, exp) + + def test_period_cons_annual(self): + # bugs in scikits.timeseries + for month in MONTHS: + freq = 'A-%s' % month + exp = Period('1989', freq=freq) + stamp = exp.to_timestamp('D', how='end') + 30 + p = Period(stamp, freq=freq) + self.assertEquals(p, exp + 1) + + def test_period_cons_weekly(self): + for num in range(10, 17): + daystr = '2011-02-%d' % num + for day in DAYS: + freq = 'W-%s' % day + + result = Period(daystr, freq=freq) + expected = Period(daystr, freq='D').asfreq(freq) + self.assertEquals(result, expected) + + def test_period_constructor(self): + i1 = Period('1/1/2005', freq='M') + i2 = Period('Jan 2005') + + self.assertEquals(i1, i2) + + i1 = Period('2005', freq='A') + i2 = Period('2005') + i3 = Period('2005', freq='a') + + self.assertEquals(i1, i2) + self.assertEquals(i1, i3) + + i4 = Period('2005', freq='M') + i5 = Period('2005', freq='m') + + self.assert_(i1 != i4) + self.assertEquals(i4, i5) + + i1 = Period.now('Q') + i2 = Period(datetime.now(), freq='Q') + i3 = Period.now('q') + + self.assertEquals(i1, i2) + self.assertEquals(i1, i3) + + # Biz day construction, roll forward if non-weekday + i1 = Period('3/10/12', freq='B') + i2 = Period('3/12/12', freq='D') + self.assertEquals(i1, i2.asfreq('B')) + + i3 = Period('3/10/12', freq='b') + self.assertEquals(i1, i3) + + i1 = Period(year=2005, quarter=1, freq='Q') + i2 = Period('1/1/2005', freq='Q') + self.assertEquals(i1, i2) + + i1 = Period(year=2005, quarter=3, freq='Q') + i2 = Period('9/1/2005', freq='Q') + self.assertEquals(i1, i2) + + i1 = Period(year=2005, month=3, day=1, freq='D') + i2 = Period('3/1/2005', freq='D') + self.assertEquals(i1, i2) + + i3 = Period(year=2005, month=3, day=1, freq='d') + self.assertEquals(i1, i3) + + i1 = Period(year=2012, month=3, day=10, freq='B') + i2 = Period('3/12/12', freq='B') + self.assertEquals(i1, i2) + + i1 = Period('2005Q1') + i2 = Period(year=2005, quarter=1, freq='Q') + i3 = Period('2005q1') + 
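+ # lowercase quarter strings such as '2005q1' should parse the same as '2005Q1'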
self.assertEquals(i1, i2) + self.assertEquals(i1, i3) + + i1 = Period('05Q1') + self.assertEquals(i1, i2) + lower = Period('05q1') + self.assertEquals(i1, lower) + + i1 = Period('1Q2005') + self.assertEquals(i1, i2) + lower = Period('1q2005') + self.assertEquals(i1, lower) + + i1 = Period('1Q05') + self.assertEquals(i1, i2) + lower = Period('1q05') + self.assertEquals(i1, lower) + + i1 = Period('4Q1984') + self.assertEquals(i1.year, 1984) + lower = Period('4q1984') + self.assertEquals(i1, lower) + + i1 = Period('1982', freq='min') + i2 = Period('1982', freq='MIN') + self.assertEquals(i1, i2) + i2 = Period('1982', freq=('Min', 1)) + self.assertEquals(i1, i2) + + expected = Period('2007-01', freq='M') + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period('200701', freq='M') + self.assertEqual(i1, expected) + + i1 = Period(200701, freq='M') + self.assertEqual(i1, expected) + + i1 = Period(ordinal=200701, freq='M') + self.assertEqual(i1.year, 18695) + + i1 = Period(datetime(2007, 1, 1), freq='M') + i2 = Period('200701', freq='M') + self.assertEqual(i1, i2) + + i1 = Period(date(2007, 1, 1), freq='M') + i2 = Period(datetime(2007, 1, 1), freq='M') + self.assertEqual(i1, i2) + + self.assertRaises(ValueError, Period, ordinal=200701) + + def test_freq_str(self): + i1 = Period('1982', freq='Min') + self.assert_(i1.freq[0] != '1') + + def test_repr(self): + p = Period('Jan-2000') + self.assert_('Jan-2000' in repr(p)) + + def test_strftime(self): + p = Period('2000-1-1 12:34:12', freq='S') + self.assert_(p.strftime('%Y-%m-%d %H:%M:%S') == + '2000-01-01 12:34:12') + + def test_sub_delta(self): + left, right = Period('2011', freq='A'), Period('2007', freq='A') + result = left - right + self.assertEqual(result, 4) + + self.assertRaises(ValueError, left.__sub__, + Period('2007-01', freq='M')) + + def test_to_timestamp(self): + p = Period('1982', freq='A') + start_ts = p.to_timestamp(how='S') + aliases = ['s', 'StarT', 'BEGIn'] + for a in aliases: + self.assertEquals(start_ts, p.to_timestamp(how=a)) + + end_ts = p.to_timestamp(how='E') + aliases = ['e', 'end', 'FINIsH'] + for a in aliases: + self.assertEquals(end_ts, p.to_timestamp(how=a)) + + from_lst = ['A', 'Q', 'M', 'W', 'B', + 'D', 'H', 'Min', 'S'] + for i, fcode in enumerate(from_lst): + p = Period('1982', freq=fcode) + result = p.to_timestamp().to_period(fcode) + self.assertEquals(result, p) + + self.assertEquals(p.start_time, p.to_timestamp(how='S')) + + self.assertEquals(p.end_time, p.to_timestamp(how='E')) + + # Frequency other than daily + + p = Period('1985', freq='A') + + result = p.to_timestamp('H', how='end') + expected = datetime(1985, 12, 31, 23) + self.assertEquals(result, expected) + + result = p.to_timestamp('T', how='end') + expected = datetime(1985, 12, 31, 23, 59) + self.assertEquals(result, expected) + + result = p.to_timestamp('S', how='end') + expected = datetime(1985, 12, 31, 23, 59, 59) + self.assertEquals(result, expected) + + expected = datetime(1985, 1, 1) + result = p.to_timestamp('H', how='start') + self.assertEquals(result, expected) + result = p.to_timestamp('T', how='start') + self.assertEquals(result, expected) + result = p.to_timestamp('S', how='start') + self.assertEquals(result, expected) + + self.assertRaises(ValueError, p.to_timestamp, '5t') + + def test_properties_annually(self): + # Test properties on Periods with annually frequency. 
+ a_date = Period(freq='A', year=2007) + assert_equal(a_date.year, 2007) + + def test_properties_quarterly(self): + # Test properties on Periods with quarterly frequency. + qedec_date = Period(freq="Q-DEC", year=2007, quarter=1) + qejan_date = Period(freq="Q-JAN", year=2007, quarter=1) + qejun_date = Period(freq="Q-JUN", year=2007, quarter=1) + # + for x in range(3): + for qd in (qedec_date, qejan_date, qejun_date): + assert_equal((qd + x).qyear, 2007) + assert_equal((qd + x).quarter, x + 1) + + + def test_properties_monthly(self): + # Test properties on Periods with monthly frequency. + m_date = Period(freq='M', year=2007, month=1) + for x in range(11): + m_ival_x = m_date + x + assert_equal(m_ival_x.year, 2007) + if 1 <= x + 1 <= 3: + assert_equal(m_ival_x.quarter, 1) + elif 4 <= x + 1 <= 6: + assert_equal(m_ival_x.quarter, 2) + elif 7 <= x + 1 <= 9: + assert_equal(m_ival_x.quarter, 3) + elif 10 <= x + 1 <= 12: + assert_equal(m_ival_x.quarter, 4) + assert_equal(m_ival_x.month, x + 1) + + + def test_properties_weekly(self): + # Test properties on Periods with weekly frequency. + w_date = Period(freq='WK', year=2007, month=1, day=7) + # + assert_equal(w_date.year, 2007) + assert_equal(w_date.quarter, 1) + assert_equal(w_date.month, 1) + assert_equal(w_date.week, 1) + assert_equal((w_date - 1).week, 52) + + + def test_properties_daily(self): + # Test properties on Periods with daily frequency. + b_date = Period(freq='B', year=2007, month=1, day=1) + # + assert_equal(b_date.year, 2007) + assert_equal(b_date.quarter, 1) + assert_equal(b_date.month, 1) + assert_equal(b_date.day, 1) + assert_equal(b_date.weekday, 0) + assert_equal(b_date.day_of_year, 1) + # + d_date = Period(freq='D', year=2007, month=1, day=1) + # + assert_equal(d_date.year, 2007) + assert_equal(d_date.quarter, 1) + assert_equal(d_date.month, 1) + assert_equal(d_date.day, 1) + assert_equal(d_date.weekday, 0) + assert_equal(d_date.day_of_year, 1) + + + def test_properties_hourly(self): + # Test properties on Periods with hourly frequency. + h_date = Period(freq='H', year=2007, month=1, day=1, hour=0) + # + assert_equal(h_date.year, 2007) + assert_equal(h_date.quarter, 1) + assert_equal(h_date.month, 1) + assert_equal(h_date.day, 1) + assert_equal(h_date.weekday, 0) + assert_equal(h_date.day_of_year, 1) + assert_equal(h_date.hour, 0) + # + + + def test_properties_minutely(self): + # Test properties on Periods with minutely frequency. + t_date = Period(freq='Min', year=2007, month=1, day=1, hour=0, + minute=0) + # + assert_equal(t_date.quarter, 1) + assert_equal(t_date.month, 1) + assert_equal(t_date.day, 1) + assert_equal(t_date.weekday, 0) + assert_equal(t_date.day_of_year, 1) + assert_equal(t_date.hour, 0) + assert_equal(t_date.minute, 0) + + + def test_properties_secondly(self): + # Test properties on Periods with secondly frequency. 
+ s_date = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + # + assert_equal(s_date.year, 2007) + assert_equal(s_date.quarter, 1) + assert_equal(s_date.month, 1) + assert_equal(s_date.day, 1) + assert_equal(s_date.weekday, 0) + assert_equal(s_date.day_of_year, 1) + assert_equal(s_date.hour, 0) + assert_equal(s_date.minute, 0) + assert_equal(s_date.second, 0) + + def test_pnow(self): + dt = datetime.now() + + val = pmod.pnow('D') + exp = Period(dt, freq='D') + self.assertEquals(val, exp) + + def test_constructor_corner(self): + self.assertRaises(ValueError, Period, year=2007, month=1, + freq='2M') + + self.assertRaises(ValueError, Period, datetime.now()) + self.assertRaises(ValueError, Period, 1.6, freq='D') + self.assertRaises(ValueError, Period, ordinal=1.6, freq='D') + self.assertRaises(ValueError, Period, ordinal=2, value=1, freq='D') + self.assertRaises(ValueError, Period) + self.assertRaises(ValueError, Period, month=1) + + p = Period('2007-01-01', freq='D') + + result = Period(p, freq='A') + exp = Period('2007', freq='A') + self.assertEquals(result, exp) + + def test_constructor_infer_freq(self): + p = Period('2007-01-01') + self.assert_(p.freq == 'D') + + p = Period('2007-01-01 07') + self.assert_(p.freq == 'H') + + p = Period('2007-01-01 07:10') + self.assert_(p.freq == 'T') + + p = Period('2007-01-01 07:10:15') + self.assert_(p.freq == 'S') + + self.assertRaises(ValueError, Period, '2007-01-01 07:10:15.123456') + + def test_comparisons(self): + p = Period('2007-01-01') + self.assertEquals(p, p) + self.assert_(not p == 1) + +def noWrap(item): + return item + +class TestFreqConversion(TestCase): + "Test frequency conversion of date objects" + + def __init__(self, *args, **kwds): + TestCase.__init__(self, *args, **kwds) + + def test_asfreq_corner(self): + val = Period(freq='A', year=2007) + self.assertRaises(ValueError, val.asfreq, '5t') + + def test_conv_annual(self): + # frequency conversion tests: from Annual Frequency + + ival_A = Period(freq='A', year=2007) + + ival_AJAN = Period(freq="A-JAN", year=2007) + ival_AJUN = Period(freq="A-JUN", year=2007) + ival_ANOV = Period(freq="A-NOV", year=2007) + + ival_A_to_Q_start = Period(freq='Q', year=2007, quarter=1) + ival_A_to_Q_end = Period(freq='Q', year=2007, quarter=4) + ival_A_to_M_start = Period(freq='M', year=2007, month=1) + ival_A_to_M_end = Period(freq='M', year=2007, month=12) + ival_A_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_A_to_W_end = Period(freq='WK', year=2007, month=12, day=31) + ival_A_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_A_to_B_end = Period(freq='B', year=2007, month=12, day=31) + ival_A_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_A_to_D_end = Period(freq='D', year=2007, month=12, day=31) + ival_A_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_A_to_H_end = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_A_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_A_to_T_end = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_A_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_A_to_S_end = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + + ival_AJAN_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_AJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_AJUN_to_D_end = Period(freq='D', year=2007, month=6, day=30) + 
ival_AJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_ANOV_to_D_end = Period(freq='D', year=2007, month=11, day=30) + ival_ANOV_to_D_start = Period(freq='D', year=2006, month=12, day=1) + + assert_equal(ival_A.asfreq('Q', 'S'), ival_A_to_Q_start) + assert_equal(ival_A.asfreq('Q', 'e'), ival_A_to_Q_end) + assert_equal(ival_A.asfreq('M', 's'), ival_A_to_M_start) + assert_equal(ival_A.asfreq('M', 'E'), ival_A_to_M_end) + assert_equal(ival_A.asfreq('WK', 'S'), ival_A_to_W_start) + assert_equal(ival_A.asfreq('WK', 'E'), ival_A_to_W_end) + assert_equal(ival_A.asfreq('B', 'S'), ival_A_to_B_start) + assert_equal(ival_A.asfreq('B', 'E'), ival_A_to_B_end) + assert_equal(ival_A.asfreq('D', 'S'), ival_A_to_D_start) + assert_equal(ival_A.asfreq('D', 'E'), ival_A_to_D_end) + assert_equal(ival_A.asfreq('H', 'S'), ival_A_to_H_start) + assert_equal(ival_A.asfreq('H', 'E'), ival_A_to_H_end) + assert_equal(ival_A.asfreq('min', 'S'), ival_A_to_T_start) + assert_equal(ival_A.asfreq('min', 'E'), ival_A_to_T_end) + assert_equal(ival_A.asfreq('T', 'S'), ival_A_to_T_start) + assert_equal(ival_A.asfreq('T', 'E'), ival_A_to_T_end) + assert_equal(ival_A.asfreq('S', 'S'), ival_A_to_S_start) + assert_equal(ival_A.asfreq('S', 'E'), ival_A_to_S_end) + + assert_equal(ival_AJAN.asfreq('D', 'S'), ival_AJAN_to_D_start) + assert_equal(ival_AJAN.asfreq('D', 'E'), ival_AJAN_to_D_end) + + assert_equal(ival_AJUN.asfreq('D', 'S'), ival_AJUN_to_D_start) + assert_equal(ival_AJUN.asfreq('D', 'E'), ival_AJUN_to_D_end) + + assert_equal(ival_ANOV.asfreq('D', 'S'), ival_ANOV_to_D_start) + assert_equal(ival_ANOV.asfreq('D', 'E'), ival_ANOV_to_D_end) + + assert_equal(ival_A.asfreq('A'), ival_A) + + + def test_conv_quarterly(self): + # frequency conversion tests: from Quarterly Frequency + + ival_Q = Period(freq='Q', year=2007, quarter=1) + ival_Q_end_of_year = Period(freq='Q', year=2007, quarter=4) + + ival_QEJAN = Period(freq="Q-JAN", year=2007, quarter=1) + ival_QEJUN = Period(freq="Q-JUN", year=2007, quarter=1) + + ival_Q_to_A = Period(freq='A', year=2007) + ival_Q_to_M_start = Period(freq='M', year=2007, month=1) + ival_Q_to_M_end = Period(freq='M', year=2007, month=3) + ival_Q_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_Q_to_W_end = Period(freq='WK', year=2007, month=3, day=31) + ival_Q_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_Q_to_B_end = Period(freq='B', year=2007, month=3, day=30) + ival_Q_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_Q_to_D_end = Period(freq='D', year=2007, month=3, day=31) + ival_Q_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_Q_to_H_end = Period(freq='H', year=2007, month=3, day=31, + hour=23) + ival_Q_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_Q_to_T_end = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_Q_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_Q_to_S_end = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + + ival_QEJAN_to_D_start = Period(freq='D', year=2006, month=2, day=1) + ival_QEJAN_to_D_end = Period(freq='D', year=2006, month=4, day=30) + + ival_QEJUN_to_D_start = Period(freq='D', year=2006, month=7, day=1) + ival_QEJUN_to_D_end = Period(freq='D', year=2006, month=9, day=30) + + assert_equal(ival_Q.asfreq('A'), ival_Q_to_A) + assert_equal(ival_Q_end_of_year.asfreq('A'), ival_Q_to_A) + + assert_equal(ival_Q.asfreq('M', 'S'), ival_Q_to_M_start) + 
assert_equal(ival_Q.asfreq('M', 'E'), ival_Q_to_M_end) + assert_equal(ival_Q.asfreq('WK', 'S'), ival_Q_to_W_start) + assert_equal(ival_Q.asfreq('WK', 'E'), ival_Q_to_W_end) + assert_equal(ival_Q.asfreq('B', 'S'), ival_Q_to_B_start) + assert_equal(ival_Q.asfreq('B', 'E'), ival_Q_to_B_end) + assert_equal(ival_Q.asfreq('D', 'S'), ival_Q_to_D_start) + assert_equal(ival_Q.asfreq('D', 'E'), ival_Q_to_D_end) + assert_equal(ival_Q.asfreq('H', 'S'), ival_Q_to_H_start) + assert_equal(ival_Q.asfreq('H', 'E'), ival_Q_to_H_end) + assert_equal(ival_Q.asfreq('Min', 'S'), ival_Q_to_T_start) + assert_equal(ival_Q.asfreq('Min', 'E'), ival_Q_to_T_end) + assert_equal(ival_Q.asfreq('S', 'S'), ival_Q_to_S_start) + assert_equal(ival_Q.asfreq('S', 'E'), ival_Q_to_S_end) + + assert_equal(ival_QEJAN.asfreq('D', 'S'), ival_QEJAN_to_D_start) + assert_equal(ival_QEJAN.asfreq('D', 'E'), ival_QEJAN_to_D_end) + assert_equal(ival_QEJUN.asfreq('D', 'S'), ival_QEJUN_to_D_start) + assert_equal(ival_QEJUN.asfreq('D', 'E'), ival_QEJUN_to_D_end) + + assert_equal(ival_Q.asfreq('Q'), ival_Q) + + def test_conv_monthly(self): + # frequency conversion tests: from Monthly Frequency + + ival_M = Period(freq='M', year=2007, month=1) + ival_M_end_of_year = Period(freq='M', year=2007, month=12) + ival_M_end_of_quarter = Period(freq='M', year=2007, month=3) + ival_M_to_A = Period(freq='A', year=2007) + ival_M_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_M_to_W_start = Period(freq='WK', year=2007, month=1, day=1) + ival_M_to_W_end = Period(freq='WK', year=2007, month=1, day=31) + ival_M_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_M_to_B_end = Period(freq='B', year=2007, month=1, day=31) + ival_M_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_M_to_D_end = Period(freq='D', year=2007, month=1, day=31) + ival_M_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_M_to_H_end = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_M_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_M_to_T_end = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_M_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_M_to_S_end = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + + assert_equal(ival_M.asfreq('A'), ival_M_to_A) + assert_equal(ival_M_end_of_year.asfreq('A'), ival_M_to_A) + assert_equal(ival_M.asfreq('Q'), ival_M_to_Q) + assert_equal(ival_M_end_of_quarter.asfreq('Q'), ival_M_to_Q) + + assert_equal(ival_M.asfreq('WK', 'S'), ival_M_to_W_start) + assert_equal(ival_M.asfreq('WK', 'E'), ival_M_to_W_end) + assert_equal(ival_M.asfreq('B', 'S'), ival_M_to_B_start) + assert_equal(ival_M.asfreq('B', 'E'), ival_M_to_B_end) + assert_equal(ival_M.asfreq('D', 'S'), ival_M_to_D_start) + assert_equal(ival_M.asfreq('D', 'E'), ival_M_to_D_end) + assert_equal(ival_M.asfreq('H', 'S'), ival_M_to_H_start) + assert_equal(ival_M.asfreq('H', 'E'), ival_M_to_H_end) + assert_equal(ival_M.asfreq('Min', 'S'), ival_M_to_T_start) + assert_equal(ival_M.asfreq('Min', 'E'), ival_M_to_T_end) + assert_equal(ival_M.asfreq('S', 'S'), ival_M_to_S_start) + assert_equal(ival_M.asfreq('S', 'E'), ival_M_to_S_end) + + assert_equal(ival_M.asfreq('M'), ival_M) + + + def test_conv_weekly(self): + # frequency conversion tests: from Weekly Frequency + + ival_W = Period(freq='WK', year=2007, month=1, day=1) + + ival_WSUN = Period(freq='WK', year=2007, month=1, day=7) + ival_WSAT = 
Period(freq='WK-SAT', year=2007, month=1, day=6) + ival_WFRI = Period(freq='WK-FRI', year=2007, month=1, day=5) + ival_WTHU = Period(freq='WK-THU', year=2007, month=1, day=4) + ival_WWED = Period(freq='WK-WED', year=2007, month=1, day=3) + ival_WTUE = Period(freq='WK-TUE', year=2007, month=1, day=2) + ival_WMON = Period(freq='WK-MON', year=2007, month=1, day=1) + + ival_WSUN_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_WSUN_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_WSAT_to_D_start = Period(freq='D', year=2006, month=12, day=31) + ival_WSAT_to_D_end = Period(freq='D', year=2007, month=1, day=6) + ival_WFRI_to_D_start = Period(freq='D', year=2006, month=12, day=30) + ival_WFRI_to_D_end = Period(freq='D', year=2007, month=1, day=5) + ival_WTHU_to_D_start = Period(freq='D', year=2006, month=12, day=29) + ival_WTHU_to_D_end = Period(freq='D', year=2007, month=1, day=4) + ival_WWED_to_D_start = Period(freq='D', year=2006, month=12, day=28) + ival_WWED_to_D_end = Period(freq='D', year=2007, month=1, day=3) + ival_WTUE_to_D_start = Period(freq='D', year=2006, month=12, day=27) + ival_WTUE_to_D_end = Period(freq='D', year=2007, month=1, day=2) + ival_WMON_to_D_start = Period(freq='D', year=2006, month=12, day=26) + ival_WMON_to_D_end = Period(freq='D', year=2007, month=1, day=1) + + ival_W_end_of_year = Period(freq='WK', year=2007, month=12, day=31) + ival_W_end_of_quarter = Period(freq='WK', year=2007, month=3, day=31) + ival_W_end_of_month = Period(freq='WK', year=2007, month=1, day=31) + ival_W_to_A = Period(freq='A', year=2007) + ival_W_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_W_to_M = Period(freq='M', year=2007, month=1) + + if Period(freq='D', year=2007, month=12, day=31).weekday == 6: + ival_W_to_A_end_of_year = Period(freq='A', year=2007) + else: + ival_W_to_A_end_of_year = Period(freq='A', year=2008) + + if Period(freq='D', year=2007, month=3, day=31).weekday == 6: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, + quarter=1) + else: + ival_W_to_Q_end_of_quarter = Period(freq='Q', year=2007, + quarter=2) + + if Period(freq='D', year=2007, month=1, day=31).weekday == 6: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=1) + else: + ival_W_to_M_end_of_month = Period(freq='M', year=2007, month=2) + + ival_W_to_B_start = Period(freq='B', year=2007, month=1, day=1) + ival_W_to_B_end = Period(freq='B', year=2007, month=1, day=5) + ival_W_to_D_start = Period(freq='D', year=2007, month=1, day=1) + ival_W_to_D_end = Period(freq='D', year=2007, month=1, day=7) + ival_W_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_W_to_H_end = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_W_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_W_to_T_end = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_W_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_W_to_S_end = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + + assert_equal(ival_W.asfreq('A'), ival_W_to_A) + assert_equal(ival_W_end_of_year.asfreq('A'), + ival_W_to_A_end_of_year) + assert_equal(ival_W.asfreq('Q'), ival_W_to_Q) + assert_equal(ival_W_end_of_quarter.asfreq('Q'), + ival_W_to_Q_end_of_quarter) + assert_equal(ival_W.asfreq('M'), ival_W_to_M) + assert_equal(ival_W_end_of_month.asfreq('M'), + ival_W_to_M_end_of_month) + + assert_equal(ival_W.asfreq('B', 'S'), ival_W_to_B_start) + 
assert_equal(ival_W.asfreq('B', 'E'), ival_W_to_B_end) + + assert_equal(ival_W.asfreq('D', 'S'), ival_W_to_D_start) + assert_equal(ival_W.asfreq('D', 'E'), ival_W_to_D_end) + + assert_equal(ival_WSUN.asfreq('D', 'S'), ival_WSUN_to_D_start) + assert_equal(ival_WSUN.asfreq('D', 'E'), ival_WSUN_to_D_end) + assert_equal(ival_WSAT.asfreq('D', 'S'), ival_WSAT_to_D_start) + assert_equal(ival_WSAT.asfreq('D', 'E'), ival_WSAT_to_D_end) + assert_equal(ival_WFRI.asfreq('D', 'S'), ival_WFRI_to_D_start) + assert_equal(ival_WFRI.asfreq('D', 'E'), ival_WFRI_to_D_end) + assert_equal(ival_WTHU.asfreq('D', 'S'), ival_WTHU_to_D_start) + assert_equal(ival_WTHU.asfreq('D', 'E'), ival_WTHU_to_D_end) + assert_equal(ival_WWED.asfreq('D', 'S'), ival_WWED_to_D_start) + assert_equal(ival_WWED.asfreq('D', 'E'), ival_WWED_to_D_end) + assert_equal(ival_WTUE.asfreq('D', 'S'), ival_WTUE_to_D_start) + assert_equal(ival_WTUE.asfreq('D', 'E'), ival_WTUE_to_D_end) + assert_equal(ival_WMON.asfreq('D', 'S'), ival_WMON_to_D_start) + assert_equal(ival_WMON.asfreq('D', 'E'), ival_WMON_to_D_end) + + assert_equal(ival_W.asfreq('H', 'S'), ival_W_to_H_start) + assert_equal(ival_W.asfreq('H', 'E'), ival_W_to_H_end) + assert_equal(ival_W.asfreq('Min', 'S'), ival_W_to_T_start) + assert_equal(ival_W.asfreq('Min', 'E'), ival_W_to_T_end) + assert_equal(ival_W.asfreq('S', 'S'), ival_W_to_S_start) + assert_equal(ival_W.asfreq('S', 'E'), ival_W_to_S_end) + + assert_equal(ival_W.asfreq('WK'), ival_W) + + + def test_conv_business(self): + # frequency conversion tests: from Business Frequency" + + ival_B = Period(freq='B', year=2007, month=1, day=1) + ival_B_end_of_year = Period(freq='B', year=2007, month=12, day=31) + ival_B_end_of_quarter = Period(freq='B', year=2007, month=3, day=30) + ival_B_end_of_month = Period(freq='B', year=2007, month=1, day=31) + ival_B_end_of_week = Period(freq='B', year=2007, month=1, day=5) + + ival_B_to_A = Period(freq='A', year=2007) + ival_B_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_B_to_M = Period(freq='M', year=2007, month=1) + ival_B_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_B_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_B_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_B_to_H_end = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_B_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_B_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_B_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_B_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + + assert_equal(ival_B.asfreq('A'), ival_B_to_A) + assert_equal(ival_B_end_of_year.asfreq('A'), ival_B_to_A) + assert_equal(ival_B.asfreq('Q'), ival_B_to_Q) + assert_equal(ival_B_end_of_quarter.asfreq('Q'), ival_B_to_Q) + assert_equal(ival_B.asfreq('M'), ival_B_to_M) + assert_equal(ival_B_end_of_month.asfreq('M'), ival_B_to_M) + assert_equal(ival_B.asfreq('WK'), ival_B_to_W) + assert_equal(ival_B_end_of_week.asfreq('WK'), ival_B_to_W) + + assert_equal(ival_B.asfreq('D'), ival_B_to_D) + + assert_equal(ival_B.asfreq('H', 'S'), ival_B_to_H_start) + assert_equal(ival_B.asfreq('H', 'E'), ival_B_to_H_end) + assert_equal(ival_B.asfreq('Min', 'S'), ival_B_to_T_start) + assert_equal(ival_B.asfreq('Min', 'E'), ival_B_to_T_end) + assert_equal(ival_B.asfreq('S', 'S'), ival_B_to_S_start) + assert_equal(ival_B.asfreq('S', 'E'), ival_B_to_S_end) + + 
assert_equal(ival_B.asfreq('B'), ival_B) + + + def test_conv_daily(self): + # frequency conversion tests: from Business Frequency" + + ival_D = Period(freq='D', year=2007, month=1, day=1) + ival_D_end_of_year = Period(freq='D', year=2007, month=12, day=31) + ival_D_end_of_quarter = Period(freq='D', year=2007, month=3, day=31) + ival_D_end_of_month = Period(freq='D', year=2007, month=1, day=31) + ival_D_end_of_week = Period(freq='D', year=2007, month=1, day=7) + + ival_D_friday = Period(freq='D', year=2007, month=1, day=5) + ival_D_saturday = Period(freq='D', year=2007, month=1, day=6) + ival_D_sunday = Period(freq='D', year=2007, month=1, day=7) + ival_D_monday = Period(freq='D', year=2007, month=1, day=8) + + ival_B_friday = Period(freq='B', year=2007, month=1, day=5) + ival_B_monday = Period(freq='B', year=2007, month=1, day=8) + + ival_D_to_A = Period(freq='A', year=2007) + + ival_Deoq_to_AJAN = Period(freq='A-JAN', year=2008) + ival_Deoq_to_AJUN = Period(freq='A-JUN', year=2007) + ival_Deoq_to_ADEC = Period(freq='A-DEC', year=2007) + + ival_D_to_QEJAN = Period(freq="Q-JAN", year=2007, quarter=4) + ival_D_to_QEJUN = Period(freq="Q-JUN", year=2007, quarter=3) + ival_D_to_QEDEC = Period(freq="Q-DEC", year=2007, quarter=1) + + ival_D_to_M = Period(freq='M', year=2007, month=1) + ival_D_to_W = Period(freq='WK', year=2007, month=1, day=7) + + ival_D_to_H_start = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_D_to_H_end = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_D_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_D_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_D_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_D_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + + assert_equal(ival_D.asfreq('A'), ival_D_to_A) + + assert_equal(ival_D_end_of_quarter.asfreq('A-JAN'), + ival_Deoq_to_AJAN) + assert_equal(ival_D_end_of_quarter.asfreq('A-JUN'), + ival_Deoq_to_AJUN) + assert_equal(ival_D_end_of_quarter.asfreq('A-DEC'), + ival_Deoq_to_ADEC) + + assert_equal(ival_D_end_of_year.asfreq('A'), ival_D_to_A) + assert_equal(ival_D_end_of_quarter.asfreq('Q'), ival_D_to_QEDEC) + assert_equal(ival_D.asfreq("Q-JAN"), ival_D_to_QEJAN) + assert_equal(ival_D.asfreq("Q-JUN"), ival_D_to_QEJUN) + assert_equal(ival_D.asfreq("Q-DEC"), ival_D_to_QEDEC) + assert_equal(ival_D.asfreq('M'), ival_D_to_M) + assert_equal(ival_D_end_of_month.asfreq('M'), ival_D_to_M) + assert_equal(ival_D.asfreq('WK'), ival_D_to_W) + assert_equal(ival_D_end_of_week.asfreq('WK'), ival_D_to_W) + + assert_equal(ival_D_friday.asfreq('B'), ival_B_friday) + assert_equal(ival_D_saturday.asfreq('B', 'S'), ival_B_friday) + assert_equal(ival_D_saturday.asfreq('B', 'E'), ival_B_monday) + assert_equal(ival_D_sunday.asfreq('B', 'S'), ival_B_friday) + assert_equal(ival_D_sunday.asfreq('B', 'E'), ival_B_monday) + + assert_equal(ival_D.asfreq('H', 'S'), ival_D_to_H_start) + assert_equal(ival_D.asfreq('H', 'E'), ival_D_to_H_end) + assert_equal(ival_D.asfreq('Min', 'S'), ival_D_to_T_start) + assert_equal(ival_D.asfreq('Min', 'E'), ival_D_to_T_end) + assert_equal(ival_D.asfreq('S', 'S'), ival_D_to_S_start) + assert_equal(ival_D.asfreq('S', 'E'), ival_D_to_S_end) + + assert_equal(ival_D.asfreq('D'), ival_D) + + def test_conv_hourly(self): + # frequency conversion tests: from Hourly Frequency" + + ival_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + 
ival_H_end_of_year = Period(freq='H', year=2007, month=12, day=31, + hour=23) + ival_H_end_of_quarter = Period(freq='H', year=2007, month=3, day=31, + hour=23) + ival_H_end_of_month = Period(freq='H', year=2007, month=1, day=31, + hour=23) + ival_H_end_of_week = Period(freq='H', year=2007, month=1, day=7, + hour=23) + ival_H_end_of_day = Period(freq='H', year=2007, month=1, day=1, + hour=23) + ival_H_end_of_bus = Period(freq='H', year=2007, month=1, day=1, + hour=23) + + ival_H_to_A = Period(freq='A', year=2007) + ival_H_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_H_to_M = Period(freq='M', year=2007, month=1) + ival_H_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_H_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_H_to_B = Period(freq='B', year=2007, month=1, day=1) + + ival_H_to_T_start = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_H_to_T_end = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=59) + ival_H_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_H_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + + assert_equal(ival_H.asfreq('A'), ival_H_to_A) + assert_equal(ival_H_end_of_year.asfreq('A'), ival_H_to_A) + assert_equal(ival_H.asfreq('Q'), ival_H_to_Q) + assert_equal(ival_H_end_of_quarter.asfreq('Q'), ival_H_to_Q) + assert_equal(ival_H.asfreq('M'), ival_H_to_M) + assert_equal(ival_H_end_of_month.asfreq('M'), ival_H_to_M) + assert_equal(ival_H.asfreq('WK'), ival_H_to_W) + assert_equal(ival_H_end_of_week.asfreq('WK'), ival_H_to_W) + assert_equal(ival_H.asfreq('D'), ival_H_to_D) + assert_equal(ival_H_end_of_day.asfreq('D'), ival_H_to_D) + assert_equal(ival_H.asfreq('B'), ival_H_to_B) + assert_equal(ival_H_end_of_bus.asfreq('B'), ival_H_to_B) + + assert_equal(ival_H.asfreq('Min', 'S'), ival_H_to_T_start) + assert_equal(ival_H.asfreq('Min', 'E'), ival_H_to_T_end) + assert_equal(ival_H.asfreq('S', 'S'), ival_H_to_S_start) + assert_equal(ival_H.asfreq('S', 'E'), ival_H_to_S_end) + + assert_equal(ival_H.asfreq('H'), ival_H) + + def test_conv_minutely(self): + # frequency conversion tests: from Minutely Frequency" + + ival_T = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + ival_T_end_of_year = Period(freq='Min', year=2007, month=12, day=31, + hour=23, minute=59) + ival_T_end_of_quarter = Period(freq='Min', year=2007, month=3, day=31, + hour=23, minute=59) + ival_T_end_of_month = Period(freq='Min', year=2007, month=1, day=31, + hour=23, minute=59) + ival_T_end_of_week = Period(freq='Min', year=2007, month=1, day=7, + hour=23, minute=59) + ival_T_end_of_day = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_bus = Period(freq='Min', year=2007, month=1, day=1, + hour=23, minute=59) + ival_T_end_of_hour = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=59) + + ival_T_to_A = Period(freq='A', year=2007) + ival_T_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_T_to_M = Period(freq='M', year=2007, month=1) + ival_T_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_T_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_T_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_T_to_H = Period(freq='H', year=2007, month=1, day=1, hour=0) + + ival_T_to_S_start = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_T_to_S_end = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + 
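# A short sketch of the same span semantics at sub-daily resolution,
# assuming the Period API used above: a minutely period covers sixty
# seconds, so 'S'/'E' pick its first and last second, and a coarser target
# such as 'H' gives the hour containing it.
from pandas import Period

t = Period(freq='Min', year=2007, month=1, day=1, hour=0, minute=0)
print(t.asfreq('S', 'S'))   # 2007-01-01 00:00:00, first second of the minute
print(t.asfreq('S', 'E'))   # 2007-01-01 00:00:59, last second of the minute
print(t.asfreq('H'))        # 2007-01-01 00:00, the containing hour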
assert_equal(ival_T.asfreq('A'), ival_T_to_A) + assert_equal(ival_T_end_of_year.asfreq('A'), ival_T_to_A) + assert_equal(ival_T.asfreq('Q'), ival_T_to_Q) + assert_equal(ival_T_end_of_quarter.asfreq('Q'), ival_T_to_Q) + assert_equal(ival_T.asfreq('M'), ival_T_to_M) + assert_equal(ival_T_end_of_month.asfreq('M'), ival_T_to_M) + assert_equal(ival_T.asfreq('WK'), ival_T_to_W) + assert_equal(ival_T_end_of_week.asfreq('WK'), ival_T_to_W) + assert_equal(ival_T.asfreq('D'), ival_T_to_D) + assert_equal(ival_T_end_of_day.asfreq('D'), ival_T_to_D) + assert_equal(ival_T.asfreq('B'), ival_T_to_B) + assert_equal(ival_T_end_of_bus.asfreq('B'), ival_T_to_B) + assert_equal(ival_T.asfreq('H'), ival_T_to_H) + assert_equal(ival_T_end_of_hour.asfreq('H'), ival_T_to_H) + + assert_equal(ival_T.asfreq('S', 'S'), ival_T_to_S_start) + assert_equal(ival_T.asfreq('S', 'E'), ival_T_to_S_end) + + assert_equal(ival_T.asfreq('Min'), ival_T) + + def test_conv_secondly(self): + # frequency conversion tests: from Secondly Frequency" + + ival_S = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=0) + ival_S_end_of_year = Period(freq='S', year=2007, month=12, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_quarter = Period(freq='S', year=2007, month=3, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_month = Period(freq='S', year=2007, month=1, day=31, + hour=23, minute=59, second=59) + ival_S_end_of_week = Period(freq='S', year=2007, month=1, day=7, + hour=23, minute=59, second=59) + ival_S_end_of_day = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_bus = Period(freq='S', year=2007, month=1, day=1, + hour=23, minute=59, second=59) + ival_S_end_of_hour = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=59, second=59) + ival_S_end_of_minute = Period(freq='S', year=2007, month=1, day=1, + hour=0, minute=0, second=59) + + ival_S_to_A = Period(freq='A', year=2007) + ival_S_to_Q = Period(freq='Q', year=2007, quarter=1) + ival_S_to_M = Period(freq='M', year=2007, month=1) + ival_S_to_W = Period(freq='WK', year=2007, month=1, day=7) + ival_S_to_D = Period(freq='D', year=2007, month=1, day=1) + ival_S_to_B = Period(freq='B', year=2007, month=1, day=1) + ival_S_to_H = Period(freq='H', year=2007, month=1, day=1, + hour=0) + ival_S_to_T = Period(freq='Min', year=2007, month=1, day=1, + hour=0, minute=0) + + assert_equal(ival_S.asfreq('A'), ival_S_to_A) + assert_equal(ival_S_end_of_year.asfreq('A'), ival_S_to_A) + assert_equal(ival_S.asfreq('Q'), ival_S_to_Q) + assert_equal(ival_S_end_of_quarter.asfreq('Q'), ival_S_to_Q) + assert_equal(ival_S.asfreq('M'), ival_S_to_M) + assert_equal(ival_S_end_of_month.asfreq('M'), ival_S_to_M) + assert_equal(ival_S.asfreq('WK'), ival_S_to_W) + assert_equal(ival_S_end_of_week.asfreq('WK'), ival_S_to_W) + assert_equal(ival_S.asfreq('D'), ival_S_to_D) + assert_equal(ival_S_end_of_day.asfreq('D'), ival_S_to_D) + assert_equal(ival_S.asfreq('B'), ival_S_to_B) + assert_equal(ival_S_end_of_bus.asfreq('B'), ival_S_to_B) + assert_equal(ival_S.asfreq('H'), ival_S_to_H) + assert_equal(ival_S_end_of_hour.asfreq('H'), ival_S_to_H) + assert_equal(ival_S.asfreq('Min'), ival_S_to_T) + assert_equal(ival_S_end_of_minute.asfreq('Min'), ival_S_to_T) + + assert_equal(ival_S.asfreq('S'), ival_S) + +class TestPeriodIndex(TestCase): + def __init__(self, *args, **kwds): + TestCase.__init__(self, *args, **kwds) + + def setUp(self): + pass + + def test_make_time_series(self): + index = PeriodIndex(freq='A', start='1/1/2001', 
end='12/1/2009') + series = Series(1, index=index) + self.assert_(isinstance(series, TimeSeries)) + + def test_astype(self): + idx = period_range('1990', '2009', freq='A') + + result = idx.astype('i8') + self.assert_(np.array_equal(result, idx.values)) + + def test_constructor_use_start_freq(self): + # GH #1118 + p = Period('4/2/2012', freq='B') + index = PeriodIndex(start=p, periods=10) + expected = PeriodIndex(start='4/2/2012', periods=10, freq='B') + self.assert_(index.equals(expected)) + + def test_constructor_field_arrays(self): + # GH #1264 + + years = np.arange(1990, 2010).repeat(4)[2:-2] + quarters = np.tile(np.arange(1, 5), 20)[2:-2] + + index = PeriodIndex(year=years, quarter=quarters, freq='Q-DEC') + expected = period_range('1990Q3', '2009Q2', freq='Q-DEC') + self.assert_(index.equals(expected)) + + self.assertRaises(ValueError, PeriodIndex, year=years, quarter=quarters, + freq='2Q-DEC') + + index = PeriodIndex(year=years, quarter=quarters) + self.assert_(index.equals(expected)) + + years = [2007, 2007, 2007] + months = [1, 2] + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='2M') + self.assertRaises(ValueError, PeriodIndex, year=years, month=months, + freq='M', start=Period('2007-01', freq='M')) + + years = [2007, 2007, 2007] + months = [1, 2, 3] + idx = PeriodIndex(year=years, month=months, freq='M') + exp = period_range('2007-01', periods=3, freq='M') + self.assert_(idx.equals(exp)) + + def test_constructor_arrays_negative_year(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(range(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + self.assert_(np.array_equal(pindex.year, years)) + self.assert_(np.array_equal(pindex.quarter, quarters)) + + def test_constructor_invalid_quarters(self): + self.assertRaises(ValueError, PeriodIndex, year=range(2000, 2004), + quarter=range(4), freq='Q-DEC') + + def test_constructor_corner(self): + self.assertRaises(ValueError, PeriodIndex, periods=10, freq='A') + + start = Period('2007', freq='A-JUN') + end = Period('2010', freq='A-DEC') + self.assertRaises(ValueError, PeriodIndex, start=start, end=end) + self.assertRaises(ValueError, PeriodIndex, start=start) + self.assertRaises(ValueError, PeriodIndex, end=end) + + result = period_range('2007-01', periods=10.5, freq='M') + exp = period_range('2007-01', periods=10, freq='M') + self.assert_(result.equals(exp)) + + def test_constructor_fromarraylike(self): + idx = period_range('2007-01', periods=20, freq='M') + + self.assertRaises(ValueError, PeriodIndex, idx.values) + self.assertRaises(ValueError, PeriodIndex, list(idx.values)) + self.assertRaises(ValueError, PeriodIndex, + data=Period('2007', freq='A')) + + result = PeriodIndex(iter(idx)) + self.assert_(result.equals(idx)) + + result = PeriodIndex(idx) + self.assert_(result.equals(idx)) + + result = PeriodIndex(idx, freq='M') + self.assert_(result.equals(idx)) + + result = PeriodIndex(idx, freq='D') + exp = idx.asfreq('D', 'e') + self.assert_(result.equals(exp)) + + def test_constructor_datetime64arr(self): + vals = np.arange(100000, 100000 + 10000, 100, dtype=np.int64) + vals = vals.view(np.dtype('M8[us]')) + + self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + + def test_comp_period(self): + idx = period_range('2007-01', periods=20, freq='M') + + result = idx < idx[10] + exp = idx.values < idx.values[10] + self.assert_(np.array_equal(result, exp)) + + def test_getitem_ndim2(self): + idx 
= period_range('2007-01', periods=3, freq='M') + + result = idx[:, None] + # MPL kludge + self.assert_(type(result) == PeriodIndex) + + def test_getitem_partial(self): + rng = period_range('2007-01', periods=50) + ts = Series(np.random.randn(len(rng)), rng) + + self.assertRaises(KeyError, ts.__getitem__, '2006') + + def test_sub(self): + rng = period_range('2007-01', periods=50) + + result = rng - 5 + exp = rng + (-5) + self.assert_(result.equals(exp)) + + def test_periods_number_check(self): + self.assertRaises(ValueError, period_range, '2011-1-1', '2012-1-1', 'B') + + def test_to_timestamp(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + series = Series(1, index=index, name='foo') + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = series.to_timestamp('D', 'end') + self.assert_(result.index.equals(exp_index)) + self.assertEquals(result.name, 'foo') + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-DEC') + result = series.to_timestamp('D', 'start') + self.assert_(result.index.equals(exp_index)) + + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = series.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = series.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + result = series.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + self.assertRaises(ValueError, index.to_timestamp, '5t') + + def test_to_timestamp_quarterly_bug(self): + years = np.arange(1960, 2000).repeat(4) + quarters = np.tile(range(1, 5), 40) + + pindex = PeriodIndex(year=years, quarter=quarters) + + stamps = pindex.to_timestamp('D', 'end') + expected = DatetimeIndex([x.to_timestamp('D', 'end') for x in pindex]) + self.assert_(stamps.equals(expected)) + + def test_to_timestamp_preserve_name(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009', + name='foo') + self.assertEquals(index.name, 'foo') + + conv = index.to_timestamp('D') + self.assertEquals(conv.name, 'foo') + + def test_as_frame_columns(self): + rng = period_range('1/1/2000', periods=5) + df = DataFrame(randn(10, 5), columns=rng) + + ts = df[rng[0]] + assert_series_equal(ts, df.ix[:, 0]) + + # GH # 1211 + repr(df) + + ts = df['1/1/2000'] + assert_series_equal(ts, df.ix[:, 0]) + + def test_nested_dict_frame_constructor(self): + rng = period_range('1/1/2000', periods=5) + df = DataFrame(randn(10, 5), columns=rng) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(col, {})[row] = df.get_value(row, col) + + result = DataFrame(data, columns=rng) + tm.assert_frame_equal(result, df) + + data = {} + for col in df.columns: + for row in df.index: + data.setdefault(row, {})[col] = df.get_value(row, col) + + result = DataFrame(data, index=rng).T + tm.assert_frame_equal(result, df) + + def test_frame_to_time_stamp(self): + K = 5 + index = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + df = DataFrame(randn(len(index), K), index=index) + df['mix'] = 'a' + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end') + self.assert_(result.index.equals(exp_index)) + 
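# A compact sketch of the to_timestamp behaviour checked in these tests,
# assuming the 0.8-era API (a 'how' of 'start'/'end' plus an optional target
# frequency); the data below are hypothetical examples, not fixtures from
# this class.
import numpy as np
from pandas import Series, period_range

rng = period_range('2001', '2003', freq='A')
ts = Series(np.arange(len(rng)), index=rng)
print(ts.to_timestamp('D', how='start').index)  # 2001-01-01, 2002-01-01, 2003-01-01
print(ts.to_timestamp('D', how='end').index)    # 2001-12-31, 2002-12-31, 2003-12-31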
assert_almost_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-DEC') + result = df.to_timestamp('D', 'start') + self.assert_(result.index.equals(exp_index)) + + def _get_with_delta(delta, freq='A-DEC'): + return date_range(to_datetime('1/1/2001') + delta, + to_datetime('12/31/2009') + delta, freq=freq) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end') + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end') + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + result = df.to_timestamp('S', 'end') + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assert_(result.index.equals(exp_index)) + + # columns + df = df.T + + exp_index = date_range('1/1/2001', end='12/31/2009', freq='A-DEC') + result = df.to_timestamp('D', 'end', axis=1) + self.assert_(result.columns.equals(exp_index)) + assert_almost_equal(result.values, df.values) + + exp_index = date_range('1/1/2001', end='1/1/2009', freq='AS-DEC') + result = df.to_timestamp('D', 'start', axis=1) + self.assert_(result.columns.equals(exp_index)) + + delta = timedelta(hours=23) + result = df.to_timestamp('H', 'end', axis=1) + exp_index = _get_with_delta(delta) + self.assert_(result.columns.equals(exp_index)) + + delta = timedelta(hours=23, minutes=59) + result = df.to_timestamp('T', 'end', axis=1) + exp_index = _get_with_delta(delta) + self.assert_(result.columns.equals(exp_index)) + + result = df.to_timestamp('S', 'end', axis=1) + delta = timedelta(hours=23, minutes=59, seconds=59) + exp_index = _get_with_delta(delta) + self.assert_(result.columns.equals(exp_index)) + + # invalid axis + self.assertRaises(ValueError, df.to_timestamp, axis=2) + + def test_index_duplicate_periods(self): + # monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[1:3] + assert_series_equal(result, expected) + result[:] = 1 + self.assert_((ts[1:3] == 1).all()) + + # not monotonic + idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq='A-JUN') + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts[2007] + expected = ts[idx == 2007] + assert_series_equal(result, expected) + + def test_constructor(self): + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 9) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 4 * 9) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + assert_equal(len(pi), 12 * 9) + + pi = PeriodIndex(freq='D', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 365 * 9 + 2) + + pi = PeriodIndex(freq='B', start='1/1/2001', end='12/31/2009') + assert_equal(len(pi), 261 * 9) + + pi = PeriodIndex(freq='H', start='1/1/2001', end='12/31/2001 23:00') + assert_equal(len(pi), 365 * 24) + + pi = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 23:59') + assert_equal(len(pi), 24 * 60) + + pi = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 23:59:59') + assert_equal(len(pi), 24 * 60 * 60) + + start = Period('02-Apr-2005', 'B') + i1 = PeriodIndex(start=start, periods=20) + assert_equal(len(i1), 20) + assert_equal(i1.freq, start.freq) + assert_equal(i1[0], start) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), 10) + 
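# A minimal sketch of the constructor contract asserted in this test: a
# PeriodIndex can be built from any two of start, end and periods at one
# fixed frequency (assuming the 0.8-era constructor keywords used here).
from pandas import Period, PeriodIndex, period_range

pi = period_range('1/1/2001', '12/1/2009', freq='M')               # start + end
pi2 = PeriodIndex(start=Period('02-Apr-2005', 'B'), periods=20)    # start + periods
pi3 = PeriodIndex(end=Period('2006-12-31', 'W'), periods=10)       # end + periods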
assert_equal(i1.freq, end_intv.freq) + assert_equal(i1[-1], end_intv) + + end_intv = Period('2006-12-31', '1w') + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assert_((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + end_intv = Period('2006-12-31', ('w', 1)) + i2 = PeriodIndex(end=end_intv, periods=10) + assert_equal(len(i1), len(i2)) + self.assert_((i1 == i2).all()) + assert_equal(i1.freq, i2.freq) + + try: + PeriodIndex(start=start, end=end_intv) + raise AssertionError('Cannot allow mixed freq for start and end') + except ValueError: + pass + + end_intv = Period('2005-05-01', 'B') + i1 = PeriodIndex(start=start, end=end_intv) + + try: + PeriodIndex(start=start) + raise AssertionError('Must specify periods if missing start or end') + except ValueError: + pass + + # infer freq from first element + i2 = PeriodIndex([end_intv, Period('2005-05-05', 'B')]) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + i2 = PeriodIndex(np.array([end_intv, Period('2005-05-05', 'B')])) + assert_equal(len(i2), 2) + assert_equal(i2[0], end_intv) + + # Mixed freq should fail + vals = [end_intv, Period('2006-12-31', 'w')] + self.assertRaises(ValueError, PeriodIndex, vals) + vals = np.array(vals) + self.assertRaises(ValueError, PeriodIndex, vals) + + def test_shift(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2002', end='12/1/2010') + + self.assert_(pi1.shift(0).equals(pi1)) + + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='A', start='1/1/2000', end='12/1/2008') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='2/1/2001', end='1/1/2010') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='M', start='12/1/2000', end='11/1/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='1/2/2001', end='12/2/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(1).values, pi2.values) + + pi1 = PeriodIndex(freq='D', start='1/1/2001', end='12/1/2009') + pi2 = PeriodIndex(freq='D', start='12/31/2000', end='11/30/2009') + assert_equal(len(pi1), len(pi2)) + assert_equal(pi1.shift(-1).values, pi2.values) + + def test_asfreq(self): + pi1 = PeriodIndex(freq='A', start='1/1/2001', end='1/1/2001') + pi2 = PeriodIndex(freq='Q', start='1/1/2001', end='1/1/2001') + pi3 = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2001') + pi4 = PeriodIndex(freq='D', start='1/1/2001', end='1/1/2001') + pi5 = PeriodIndex(freq='H', start='1/1/2001', end='1/1/2001 00:00') + pi6 = PeriodIndex(freq='Min', start='1/1/2001', end='1/1/2001 00:00') + pi7 = PeriodIndex(freq='S', start='1/1/2001', end='1/1/2001 00:00:00') + + self.assertEquals(pi1.asfreq('Q', 'S'), pi2) + self.assertEquals(pi1.asfreq('Q', 's'), pi2) + self.assertEquals(pi1.asfreq('M', 'start'), pi3) + self.assertEquals(pi1.asfreq('D', 'StarT'), pi4) + self.assertEquals(pi1.asfreq('H', 'beGIN'), pi5) + self.assertEquals(pi1.asfreq('Min', 'S'), pi6) + self.assertEquals(pi1.asfreq('S', 'S'), pi7) + + self.assertEquals(pi2.asfreq('A', 'S'), pi1) 
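# A brief sketch of shift and asfreq on a whole PeriodIndex, assuming the
# API exercised in test_shift/test_asfreq: shift moves every element by
# whole steps of the index frequency, and asfreq with 'S' anchors each
# period at its start when converting to a finer frequency.
from pandas import PeriodIndex

pi = PeriodIndex(freq='M', start='1/1/2001', end='12/1/2001')
print(pi.shift(1)[0])          # 2001-02, one month forward
print(pi.shift(-1)[-1])        # 2001-11, one month back
print(pi.asfreq('D', 'S')[0])  # 2001-01-01, the first day of the first month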
+ self.assertEquals(pi2.asfreq('M', 'S'), pi3) + self.assertEquals(pi2.asfreq('D', 'S'), pi4) + self.assertEquals(pi2.asfreq('H', 'S'), pi5) + self.assertEquals(pi2.asfreq('Min', 'S'), pi6) + self.assertEquals(pi2.asfreq('S', 'S'), pi7) + + self.assertEquals(pi3.asfreq('A', 'S'), pi1) + self.assertEquals(pi3.asfreq('Q', 'S'), pi2) + self.assertEquals(pi3.asfreq('D', 'S'), pi4) + self.assertEquals(pi3.asfreq('H', 'S'), pi5) + self.assertEquals(pi3.asfreq('Min', 'S'), pi6) + self.assertEquals(pi3.asfreq('S', 'S'), pi7) + + self.assertEquals(pi4.asfreq('A', 'S'), pi1) + self.assertEquals(pi4.asfreq('Q', 'S'), pi2) + self.assertEquals(pi4.asfreq('M', 'S'), pi3) + self.assertEquals(pi4.asfreq('H', 'S'), pi5) + self.assertEquals(pi4.asfreq('Min', 'S'), pi6) + self.assertEquals(pi4.asfreq('S', 'S'), pi7) + + self.assertEquals(pi5.asfreq('A', 'S'), pi1) + self.assertEquals(pi5.asfreq('Q', 'S'), pi2) + self.assertEquals(pi5.asfreq('M', 'S'), pi3) + self.assertEquals(pi5.asfreq('D', 'S'), pi4) + self.assertEquals(pi5.asfreq('Min', 'S'), pi6) + self.assertEquals(pi5.asfreq('S', 'S'), pi7) + + self.assertEquals(pi6.asfreq('A', 'S'), pi1) + self.assertEquals(pi6.asfreq('Q', 'S'), pi2) + self.assertEquals(pi6.asfreq('M', 'S'), pi3) + self.assertEquals(pi6.asfreq('D', 'S'), pi4) + self.assertEquals(pi6.asfreq('H', 'S'), pi5) + self.assertEquals(pi6.asfreq('S', 'S'), pi7) + + self.assertEquals(pi7.asfreq('A', 'S'), pi1) + self.assertEquals(pi7.asfreq('Q', 'S'), pi2) + self.assertEquals(pi7.asfreq('M', 'S'), pi3) + self.assertEquals(pi7.asfreq('D', 'S'), pi4) + self.assertEquals(pi7.asfreq('H', 'S'), pi5) + self.assertEquals(pi7.asfreq('Min', 'S'), pi6) + + self.assertRaises(ValueError, pi7.asfreq, 'T', 'foo') + self.assertRaises(ValueError, pi1.asfreq, '5t') + + def test_ts_repr(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') + ts = Series(np.random.randn(len(index)), index=index) + repr(ts) + + def test_asfreq_ts(self): + index = PeriodIndex(freq='A', start='1/1/2001', end='12/31/2010') + ts = Series(np.random.randn(len(index)), index=index) + df = DataFrame(np.random.randn(len(index), 3), index=index) + + result = ts.asfreq('D', how='end') + df_result = df.asfreq('D', how='end') + exp_index = index.asfreq('D', how='end') + self.assert_(len(result) == len(ts)) + self.assert_(result.index.equals(exp_index)) + self.assert_(df_result.index.equals(exp_index)) + + result = ts.asfreq('D', how='start') + self.assert_(len(result) == len(ts)) + self.assert_(result.index.equals(index.asfreq('D', how='start'))) + + def test_badinput(self): + self.assertRaises(datetools.DateParseError, Period, '1/1/-2000', 'A') + # self.assertRaises(datetools.DateParseError, Period, '-2000', 'A') + # self.assertRaises(datetools.DateParseError, Period, '0', 'A') + + def test_negative_ordinals(self): + p = Period(ordinal=-1000, freq='A') + + p = Period(ordinal=0, freq='A') + + idx = PeriodIndex(ordinal=[-1, 0, 1], freq='A') + idx = PeriodIndex(ordinal=np.array([-1, 0, 1]), freq='A') + + def test_dti_to_period(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + pi1 = dti.to_period() + pi2 = dti.to_period(freq='D') + + self.assertEquals(pi1[0], Period('Jan 2005', freq='M')) + self.assertEquals(pi2[0], Period('1/31/2005', freq='D')) + + self.assertEquals(pi1[-1], Period('Nov 2005', freq='M')) + self.assertEquals(pi2[-1], Period('11/30/2005', freq='D')) + + def test_pindex_slice_index(self): + pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='M') + s = Series(np.random.rand(len(pi)), 
index=pi) + res = s['2010'] + exp = s[0:12] + assert_series_equal(res, exp) + res = s['2011'] + exp = s[12:24] + assert_series_equal(res, exp) + + def test_pindex_qaccess(self): + pi = PeriodIndex(['2Q05', '3Q05', '4Q05', '1Q06', '2Q06'], freq='Q') + s = Series(np.random.rand(len(pi)), index=pi).cumsum() + # Todo: fix these accessors! + self.assert_(s['05Q4'] == s[2]) + + def test_period_dt64_round_trip(self): + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period() + self.assert_(pi.to_timestamp().equals(dti)) + + dti = date_range('1/1/2000', '1/7/2002', freq='B') + pi = dti.to_period(freq='H') + self.assert_(pi.to_timestamp().equals(dti)) + + def test_to_period_quarterly(self): + # make sure we can make the round trip + for month in MONTHS: + freq = 'Q-%s' % month + rng = period_range('1989Q3', '1991Q3', freq=freq) + stamps = rng.to_timestamp() + result = stamps.to_period(freq) + self.assert_(rng.equals(result)) + + def test_no_multiples(self): + self.assertRaises(ValueError, period_range, '1989Q3', periods=10, + freq='2Q') + + self.assertRaises(ValueError, period_range, '1989', periods=10, + freq='2A') + self.assertRaises(ValueError, Period, '1989', freq='2A') + + # def test_pindex_multiples(self): + # pi = PeriodIndex(start='1/1/10', end='12/31/12', freq='2M') + # self.assertEquals(pi[0], Period('1/1/10', '2M')) + # self.assertEquals(pi[1], Period('3/1/10', '2M')) + + # self.assertEquals(pi[0].asfreq('6M'), pi[2].asfreq('6M')) + # self.assertEquals(pi[0].asfreq('A'), pi[2].asfreq('A')) + + # self.assertEquals(pi[0].asfreq('M', how='S'), + # Period('Jan 2010', '1M')) + # self.assertEquals(pi[0].asfreq('M', how='E'), + # Period('Feb 2010', '1M')) + # self.assertEquals(pi[1].asfreq('M', how='S'), + # Period('Mar 2010', '1M')) + + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEquals(i, Period('1/1/2010 12:05:15', '5S')) + + # i = Period('1/1/2010 12:05:18', '5S') + # self.assertEquals(i.asfreq('1S', how='E'), + # Period('1/1/2010 12:05:19', '1S')) + + def test_iteration(self): + index = PeriodIndex(start='1/1/10', periods=4, freq='B') + + result = list(index) + self.assert_(isinstance(result[0], Period)) + self.assert_(result[0].freq == index.freq) + + def test_take(self): + index = PeriodIndex(start='1/1/10', end='12/31/12', freq='D') + + taken = index.take([5, 6, 8, 12]) + taken2 = index[[5, 6, 8, 12]] + self.assert_(isinstance(taken, PeriodIndex)) + self.assert_(taken.freq == index.freq) + self.assert_(isinstance(taken2, PeriodIndex)) + self.assert_(taken2.freq == index.freq) + + def test_joins(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + for kind in ['inner', 'outer', 'left', 'right']: + joined = index.join(index[:-5], how=kind) + + self.assert_(isinstance(joined, PeriodIndex)) + self.assert_(joined.freq == index.freq) + + def test_align_series(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected[1::2] = np.nan + assert_series_equal(result, expected) + + result = ts + _permute(ts[::2]) + assert_series_equal(result, expected) + + # it works! 
+ for kind in ['inner', 'outer', 'left', 'right']: + ts.align(ts[::2], join=kind) + + self.assertRaises(Exception, ts.__add__, + ts.asfreq('D', how='end')) + + def test_align_frame(self): + rng = period_range('1/1/2000', '1/1/2010', freq='A') + ts = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts + ts[::2] + expected = ts + ts + expected.values[1::2] = np.nan + tm.assert_frame_equal(result, expected) + + result = ts + _permute(ts[::2]) + tm.assert_frame_equal(result, expected) + + def test_union(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].union(index[10:]) + self.assert_(result.equals(index)) + + # not in order + result = _permute(index[:-5]).union(_permute(index[10:])) + self.assert_(result.equals(index)) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + self.assertRaises(Exception, index.union, index2) + + self.assertRaises(ValueError, index.join, index.to_timestamp()) + + def test_intersection(self): + index = period_range('1/1/2000', '1/20/2000', freq='D') + + result = index[:-5].intersection(index[10:]) + self.assert_(result.equals(index[10:-5])) + + # not in order + left = _permute(index[:-5]) + right = _permute(index[10:]) + result = left.intersection(right).order() + self.assert_(result.equals(index[10:-5])) + + # raise if different frequencies + index = period_range('1/1/2000', '1/20/2000', freq='D') + index2 = period_range('1/1/2000', '1/20/2000', freq='W-WED') + self.assertRaises(Exception, index.intersection, index2) + + def test_fields(self): + # year, month, day, hour, minute + # second, weekofyear, week, dayofweek, weekday, dayofyear, quarter + # qyear + pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2005') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Q', start='1/1/2001', end='12/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='M', start='1/1/2001', end='1/1/2002') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='D', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='B', start='12/1/2001', end='6/1/2001') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='H', start='12/31/2001', end='1/1/2002 23:00') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='Min', start='12/31/2001', end='1/1/2002 00:20') + self._check_all_fields(pi) + + pi = PeriodIndex(freq='S', start='12/31/2001 00:00:00', + end='12/31/2001 00:05:00') + self._check_all_fields(pi) + + end_intv = Period('2006-12-31', 'W') + i1 = PeriodIndex(end=end_intv, periods=10) + self._check_all_fields(pi) + + def _check_all_fields(self, periodindex): + fields = ['year', 'month', 'day', 'hour', 'minute', + 'second', 'weekofyear', 'week', 'dayofweek', + 'weekday', 'dayofyear', 'quarter', 'qyear'] + + periods = list(periodindex) + + for field in fields: + field_idx = getattr(periodindex, field) + assert_equal(len(periodindex), len(field_idx)) + for x, val in zip(periods, field_idx): + assert_equal(getattr(x, field), val) + + def test_is_full(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + self.assert_(not index.is_full) + + index = PeriodIndex([2005, 2006, 2007], freq='A') + self.assert_(index.is_full) + + index = PeriodIndex([2005, 2005, 2007], freq='A') + self.assert_(not index.is_full) + + index = PeriodIndex([2005, 2005, 2006], freq='A') + self.assert_(index.is_full) + + index = PeriodIndex([2006, 2005, 2005], freq='A') + 
self.assertRaises(ValueError, getattr, index, 'is_full') + + self.assert_(index[:0].is_full) + + def test_map(self): + index = PeriodIndex([2005, 2007, 2009], freq='A') + result = index.map(lambda x: x + 1) + expected = index + 1 + self.assert_(result.equals(expected)) + + result = index.map(lambda x: x.ordinal) + exp = [x.ordinal for x in index] + self.assert_(np.array_equal(result, exp)) + +def _permute(obj): + return obj.take(np.random.permutation(len(obj))) + + +class TestMethods(TestCase): + "Base test class for MaskedArrays." + + def __init__(self, *args, **kwds): + TestCase.__init__(self, *args, **kwds) + + def test_add(self): + dt1 = Period(freq='D', year=2008, month=1, day=1) + dt2 = Period(freq='D', year=2008, month=1, day=2) + assert_equal(dt1 + 1, dt2) + # + self.assertRaises(TypeError, dt1.__add__, "str") + self.assertRaises(TypeError, dt1.__add__, dt2) + + +class TestPeriodRepresentation(unittest.TestCase): + """ + Wish to match NumPy units + """ + + def test_annual(self): + self._check_freq('A', 1970) + + def test_monthly(self): + self._check_freq('M', '1970-01') + + def test_weekly(self): + self._check_freq('W-THU', '1970-01-01') + + def test_daily(self): + self._check_freq('D', '1970-01-01') + + def test_business_daily(self): + self._check_freq('B', '1970-01-01') + + def test_hourly(self): + self._check_freq('H', '1970-01-01') + + def test_minutely(self): + self._check_freq('T', '1970-01-01') + + def test_secondly(self): + self._check_freq('S', '1970-01-01') + + def _check_freq(self, freq, base_date): + rng = PeriodIndex(start=base_date, periods=10, freq=freq) + exp = np.arange(10, dtype=np.int64) + self.assert_(np.array_equal(rng.values, exp)) + + def test_negone_ordinals(self): + freqs = ['A', 'M', 'Q', 'D','H', 'T', 'S'] + + period = Period(ordinal=-1, freq='D') + for freq in freqs: + repr(period.asfreq(freq)) + + for freq in freqs: + period = Period(ordinal=-1, freq=freq) + repr(period) + self.assertEquals(period.year, 1969) + + period = Period(ordinal=-1, freq='B') + repr(period) + period = Period(ordinal=-1, freq='W') + repr(period) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_plotting.py b/pandas/tseries/tests/test_plotting.py new file mode 100644 index 00000000..198e88c3 --- /dev/null +++ b/pandas/tseries/tests/test_plotting.py @@ -0,0 +1,665 @@ +import os +from datetime import datetime, timedelta, date, time + +import unittest +import nose + +import numpy as np +from numpy.testing.decorators import slow +from numpy.testing import assert_array_equal + +from pandas import Index, Series, DataFrame, isnull, notnull + +from pandas.tseries.index import date_range, bdate_range +from pandas.tseries.offsets import Minute, DateOffset +from pandas.tseries.period import period_range, Period +from pandas.tseries.resample import DatetimeIndex, TimeGrouper +import pandas.tseries.offsets as offsets +import pandas.tseries.frequencies as frequencies + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +class TestTSPlot(unittest.TestCase): + + @classmethod + def setUpClass(cls): + import sys + if 'IPython' in sys.modules: + raise nose.SkipTest + + try: + import matplotlib as mpl + mpl.use('Agg', warn=False) + except ImportError: + raise nose.SkipTest + + def setUp(self): + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q', 'Y'] + idx = [period_range('12/31/1999', freq=x, periods=100) for x in freq] + 
self.period_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.period_df = [DataFrame(np.random.randn(len(x), 3), index=x, + columns=['A', 'B', 'C']) + for x in idx] + + freq = ['S', 'T', 'H', 'D', 'W', 'M', 'Q-DEC', 'A', '1B30Min'] + idx = [date_range('12/31/1999', freq=x, periods=100) for x in freq] + self.datetime_ser = [Series(np.random.randn(len(x)), x) for x in idx] + self.datetime_df = [DataFrame(np.random.randn(len(x), 3), index=x, + columns=['A', 'B', 'C']) + for x in idx] + + @slow + def test_frame_inferred(self): + # inferred freq + import matplotlib.pyplot as plt + plt.close('all') + idx = date_range('1/1/1987', freq='MS', periods=100) + idx = DatetimeIndex(idx.values, freq=None) + df = DataFrame(np.random.randn(len(idx), 3), index=idx) + df.plot() + + # axes freq + idx = idx[0:40] + idx[45:99] + df2 = DataFrame(np.random.randn(len(idx), 3), index=idx) + df2.plot() + plt.close('all') + + @slow + def test_tsplot(self): + from pandas.tseries.plotting import tsplot + import matplotlib.pyplot as plt + plt.close('all') + + ax = plt.gca() + ts = tm.makeTimeSeries() + plot_ax = tsplot(ts, plt.Axes.plot) + self.assert_(plot_ax == ax) + + f = lambda *args, **kwds: tsplot(s, plt.Axes.plot, *args, **kwds) + plt.close('all') + + for s in self.period_ser: + _check_plot_works(f, s.index.freq, ax=ax, series=s) + plt.close('all') + for s in self.datetime_ser: + _check_plot_works(f, s.index.freq.rule_code, ax=ax, series=s) + plt.close('all') + + plt.close('all') + ax = ts.plot(style='k') + self.assert_((0., 0., 0.) == ax.get_lines()[0].get_color()) + + @slow + def test_high_freq(self): + freaks = ['ms', 'us'] + for freq in freaks: + rng = date_range('1/1/2012', periods=100000, freq=freq) + ser = Series(np.random.randn(len(rng)), rng) + _check_plot_works(ser.plot) + + def test_get_datevalue(self): + from pandas.tseries.plotting import get_datevalue + self.assert_(get_datevalue(None, 'D') is None) + self.assert_(get_datevalue(1987, 'A') == 1987) + self.assert_(get_datevalue(Period(1987, 'A'), 'M') == + Period('1987-12', 'M').ordinal) + self.assert_(get_datevalue('1/1/1987', 'D') == + Period('1987-1-1', 'D').ordinal) + + @slow + def test_line_plot_period_series(self): + for s in self.period_ser: + _check_plot_works(s.plot, s.index.freq) + + @slow + def test_line_plot_datetime_series(self): + for s in self.datetime_ser: + _check_plot_works(s.plot, s.index.freq.rule_code) + + @slow + def test_line_plot_period_frame(self): + for df in self.period_df: + _check_plot_works(df.plot, df.index.freq) + + @slow + def test_line_plot_datetime_frame(self): + for df in self.datetime_df: + freq = df.index.to_period(df.index.freq.rule_code).freq + _check_plot_works(df.plot, freq) + + @slow + def test_line_plot_inferred_freq(self): + for ser in self.datetime_ser: + ser = Series(ser.values, Index(np.asarray(ser.index))) + _check_plot_works(ser.plot, ser.index.inferred_freq) + + ser = ser[[0, 3, 5, 6]] + _check_plot_works(ser.plot) + + @slow + def test_plot_offset_freq(self): + ser = tm.makeTimeSeries() + _check_plot_works(ser.plot) + + dr = date_range(ser.index[0], freq='BQS', periods=10) + ser = Series(np.random.randn(len(dr)), dr) + _check_plot_works(ser.plot) + + @slow + def test_plot_multiple_inferred_freq(self): + dr = Index([datetime(2000, 1, 1), + datetime(2000, 1, 6), + datetime(2000, 1, 11)]) + ser = Series(np.random.randn(len(dr)), dr) + _check_plot_works(ser.plot) + + @slow + def test_irregular_datetime64_repr_bug(self): + import matplotlib.pyplot as plt + ser = tm.makeTimeSeries() + ser = 
ser[[0,1,2,7]] + + fig = plt.gcf() + plt.clf() + ax = fig.add_subplot(211) + ret = ser.plot() + assert(ret is not None) + + for rs, xp in zip(ax.get_lines()[0].get_xdata(), ser.index): + assert(rs == xp) + + @slow + def test_business_freq(self): + import matplotlib.pyplot as plt + plt.close('all') + bts = tm.makePeriodSeries() + ax = bts.plot() + self.assert_(ax.get_lines()[0].get_xydata()[0, 0], + bts.index[0].ordinal) + idx = ax.get_lines()[0].get_xdata() + self.assert_(idx.freqstr == 'B') + + @slow + def test_business_freq_convert(self): + import matplotlib.pyplot as plt + plt.close('all') + n = tm.N + tm.N = 300 + bts = tm.makeTimeSeries().asfreq('BM') + tm.N = n + ts = bts.to_period('M') + ax = bts.plot() + self.assert_(ax.get_lines()[0].get_xydata()[0, 0], ts.index[0].ordinal) + idx = ax.get_lines()[0].get_xdata() + self.assert_(idx.freqstr == 'M') + + @slow + def test_dataframe(self): + bts = DataFrame({'a': tm.makeTimeSeries()}) + ax = bts.plot() + idx = ax.get_lines()[0].get_xdata() + + @slow + def test_axis_limits(self): + def _test(ax): + xlim = ax.get_xlim() + ax.set_xlim(xlim[0] - 5, xlim[1] + 10) + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(result[0], xlim[0] - 5) + self.assertEqual(result[1], xlim[1] + 10) + + # string + expected = (Period('1/1/2000', ax.freq), + Period('4/1/2000', ax.freq)) + ax.set_xlim('1/1/2000', '4/1/2000') + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(int(result[0]), expected[0].ordinal) + self.assertEqual(int(result[1]), expected[1].ordinal) + + # datetim + expected = (Period('1/1/2000', ax.freq), + Period('4/1/2000', ax.freq)) + ax.set_xlim(datetime(2000, 1, 1), datetime(2000, 4, 1)) + ax.get_figure().canvas.draw() + result = ax.get_xlim() + self.assertEqual(int(result[0]), expected[0].ordinal) + self.assertEqual(int(result[1]), expected[1].ordinal) + + ser = tm.makeTimeSeries() + ax = ser.plot() + _test(ax) + + df = DataFrame({'a' : ser, 'b' : ser + 1}) + ax = df.plot() + _test(ax) + + df = DataFrame({'a' : ser, 'b' : ser + 1}) + axes = df.plot(subplots=True) + [_test(ax) for ax in axes] + + def test_get_finder(self): + import pandas.tseries.converter as conv + + self.assertEqual(conv.get_finder('B'), conv._daily_finder) + self.assertEqual(conv.get_finder('D'), conv._daily_finder) + self.assertEqual(conv.get_finder('M'), conv._monthly_finder) + self.assertEqual(conv.get_finder('Q'), conv._quarterly_finder) + self.assertEqual(conv.get_finder('A'), conv._annual_finder) + self.assertEqual(conv.get_finder('W'), conv._daily_finder) + + @slow + def test_finder_daily(self): + xp = Period('1999-1-1', freq='B').ordinal + day_lst = [10, 40, 252, 400, 950, 2750, 10000] + for n in day_lst: + rng = bdate_range('1999-1-1', periods=n) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + (vmin, vmax) = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + + @slow + def test_finder_quarterly(self): + import matplotlib.pyplot as plt + xp = Period('1988Q1').ordinal + yrs = [3.5, 11] + plt.close('all') + for n in yrs: + rng = period_range('1987Q2', periods=int(n * 4), freq='Q') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assert_(rs == xp) + (vmin, vmax) = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + + 
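# A display-free sketch of what the finder tests around here measure,
# assuming matplotlib is available: plotting a period-indexed Series puts
# Period ordinals on the x-axis, and the quarterly finder places the first
# major tick on a year boundary (1988Q1 for a range starting in 1987Q2).
import numpy as np
import matplotlib
matplotlib.use('Agg')          # mirror the non-interactive backend from setUpClass
import matplotlib.pyplot as plt
from pandas import Series, Period, period_range

rng = period_range('1987Q2', periods=14, freq='Q')
ser = Series(np.random.randn(len(rng)), rng)
ax = ser.plot()
first_tick = ax.get_xaxis().get_majorticklocs()[0]
print(first_tick == Period('1988Q1').ordinal)
plt.close('all')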
@slow + def test_finder_monthly(self): + import matplotlib.pyplot as plt + xp = Period('1988-1').ordinal + yrs = [1.15, 2.5, 4, 11] + plt.close('all') + for n in yrs: + rng = period_range('1987Q2', periods=int(n * 12), freq='M') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assert_(rs == xp) + (vmin, vmax) = ax.get_xlim() + ax.set_xlim(vmin + 0.9, vmax) + rs = xaxis.get_majorticklocs()[0] + self.assertEqual(xp, rs) + plt.close('all') + + @slow + def test_finder_monthly_long(self): + import matplotlib.pyplot as plt + plt.close('all') + rng = period_range('1988Q1', periods=24*12, freq='M') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1989Q1', 'M').ordinal + self.assert_(rs == xp) + + @slow + def test_finder_annual(self): + import matplotlib.pyplot as plt + plt.close('all') + xp = [1987, 1988, 1990, 1990, 1995, 2020, 2070, 2170] + for i, nyears in enumerate([5, 10, 19, 49, 99, 199, 599, 1001]): + rng = period_range('1987', periods=nyears, freq='A') + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + self.assert_(rs == Period(xp[i], freq='A').ordinal) + plt.close('all') + + @slow + def test_finder_minutely(self): + import matplotlib.pyplot as plt + plt.close('all') + nminutes = 50 * 24 * 60 + rng = date_range('1/1/1999', freq='Min', periods=nminutes) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1/1/1999', freq='Min').ordinal + self.assertEqual(rs, xp) + + @slow + def test_finder_hourly(self): + import matplotlib.pyplot as plt + plt.close('all') + nhours = 23 + rng = date_range('1/1/1999', freq='H', periods=nhours) + ser = Series(np.random.randn(len(rng)), rng) + ax = ser.plot() + xaxis = ax.get_xaxis() + rs = xaxis.get_majorticklocs()[0] + xp = Period('1/1/1999', freq='H').ordinal + self.assertEqual(rs, xp) + + @slow + def test_gaps(self): + import matplotlib.pyplot as plt + plt.close('all') + ts = tm.makeTimeSeries() + ts[5:25] = np.nan + ax = ts.plot() + lines = ax.get_lines() + self.assert_(len(lines) == 1) + l = lines[0] + data = l.get_xydata() + self.assert_(isinstance(data, np.ma.core.MaskedArray)) + mask = data.mask + self.assert_(mask[5:25, 1].all()) + + # irregular + plt.close('all') + ts = tm.makeTimeSeries() + ts = ts[[0, 1, 2, 5, 7, 9, 12, 15, 20]] + ts[2:5] = np.nan + ax = ts.plot() + lines = ax.get_lines() + self.assert_(len(lines) == 1) + l = lines[0] + data = l.get_xydata() + self.assert_(isinstance(data, np.ma.core.MaskedArray)) + mask = data.mask + self.assert_(mask[2:5, 1].all()) + + # non-ts + plt.close('all') + idx = [0, 1, 2, 5, 7, 9, 12, 15, 20] + ser = Series(np.random.randn(len(idx)), idx) + ser[2:5] = np.nan + ax = ser.plot() + lines = ax.get_lines() + self.assert_(len(lines) == 1) + l = lines[0] + data = l.get_xydata() + self.assert_(isinstance(data, np.ma.core.MaskedArray)) + mask = data.mask + self.assert_(mask[2:5, 1].all()) + + @slow + def test_secondary_y(self): + import matplotlib.pyplot as plt + plt.close('all') + ser = Series(np.random.randn(10)) + ser2 = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True) + fig = ax.get_figure() + axes = fig.get_axes() + l = ax.get_lines()[0] + xp = Series(l.get_ydata(), l.get_xdata()) + assert_series_equal(ser, xp) + self.assert_(ax.get_yaxis().get_ticks_position() 
== 'right') + self.assert_(not axes[0].get_yaxis().get_visible()) + + ax2 = ser2.plot() + self.assert_(ax2.get_yaxis().get_ticks_position() == 'left') + + plt.close('all') + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True) + self.assert_(ax.get_yaxis().get_visible()) + + plt.close('all') + + @slow + def test_secondary_y_ts(self): + import matplotlib.pyplot as plt + plt.close('all') + idx = date_range('1/1/2000', periods=10) + ser = Series(np.random.randn(10), idx) + ser2 = Series(np.random.randn(10), idx) + ax = ser.plot(secondary_y=True) + fig = ax.get_figure() + axes = fig.get_axes() + l = ax.get_lines()[0] + xp = Series(l.get_ydata(), l.get_xdata()).to_timestamp() + assert_series_equal(ser, xp) + self.assert_(ax.get_yaxis().get_ticks_position() == 'right') + self.assert_(not axes[0].get_yaxis().get_visible()) + + ax2 = ser2.plot() + self.assert_(ax2.get_yaxis().get_ticks_position() == 'left') + + plt.close('all') + ax = ser2.plot() + ax2 = ser.plot(secondary_y=True) + self.assert_(ax.get_yaxis().get_visible()) + + @slow + def test_secondary_kde(self): + import matplotlib.pyplot as plt + plt.close('all') + ser = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True, kind='density') + fig = ax.get_figure() + axes = fig.get_axes() + self.assert_(axes[1].get_yaxis().get_ticks_position() == 'right') + + @slow + def test_secondary_bar(self): + import matplotlib.pyplot as plt + plt.close('all') + ser = Series(np.random.randn(10)) + ax = ser.plot(secondary_y=True, kind='bar') + fig = ax.get_figure() + axes = fig.get_axes() + self.assert_(axes[1].get_yaxis().get_ticks_position() == 'right') + + @slow + def test_secondary_frame(self): + import matplotlib.pyplot as plt + plt.close('all') + df = DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) + axes = df.plot(secondary_y=['a', 'c'], subplots=True) + self.assert_(axes[0].get_yaxis().get_ticks_position() == 'right') + self.assert_(axes[1].get_yaxis().get_ticks_position() == 'default') + self.assert_(axes[2].get_yaxis().get_ticks_position() == 'right') + + @slow + def test_mixed_freq_regular_first(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s1.plot() + ax2 = s2.plot(style='g') + lines = ax2.get_lines() + idx1 = lines[0].get_xdata() + idx2 = lines[1].get_xdata() + self.assert_(idx1.equals(s1.index.to_period('B'))) + self.assert_(idx2.equals(s2.index.to_period('B'))) + left, right = ax2.get_xlim() + pidx = s1.index.to_period() + self.assert_(left == pidx[0].ordinal) + self.assert_(right == pidx[-1].ordinal) + plt.close('all') + + @slow + def test_mixed_freq_irregular_first(self): + import matplotlib.pyplot as plt + plt.close('all') + s1 = tm.makeTimeSeries() + s2 = s1[[0, 5, 10, 11, 12, 13, 14, 15]] + s2.plot(style='g') + ax = s1.plot() + self.assert_(not hasattr(ax, 'freq')) + lines = ax.get_lines() + x1 = lines[0].get_xdata() + assert_array_equal(x1, s2.index.asobject.values) + x2 = lines[1].get_xdata() + assert_array_equal(x2, s1.index.asobject.values) + plt.close('all') + + @slow + def test_mixed_freq_hf_first(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'D') + + @slow + def test_mixed_freq_lf_first(self): + import 
matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=365, freq='D') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + + @slow + def test_mixed_freq_irreg_period(self): + ts = tm.makeTimeSeries() + irreg = ts[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 16, 17, 18, 29]] + rng = period_range('1/3/2000', periods=30, freq='B') + ps = Series(np.random.randn(len(rng)), rng) + irreg.plot() + ps.plot() + + @slow + def test_to_weekly_resampling(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + high.plot() + ax = low.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq.startswith('W')) + + @slow + def test_from_weekly_resampling(self): + import matplotlib.pyplot as plt + plt.close('all') + idxh = date_range('1/1/1999', periods=52, freq='W') + idxl = date_range('1/1/1999', periods=12, freq='M') + high = Series(np.random.randn(len(idxh)), idxh) + low = Series(np.random.randn(len(idxl)), idxl) + low.plot() + ax = high.plot() + for l in ax.get_lines(): + self.assert_(l.get_xdata().freq == 'M') + + @slow + def test_irreg_dtypes(self): + import matplotlib.pyplot as plt + #date + idx = [date(2000, 1, 1), date(2000, 1, 5), date(2000, 1, 20)] + df = DataFrame(np.random.randn(len(idx), 3), Index(idx, dtype=object)) + _check_plot_works(df.plot) + + #np.datetime64 + idx = date_range('1/1/2000', periods=10) + idx = idx[[0, 2, 5, 9]].asobject + df = DataFrame(np.random.randn(len(idx), 3), idx) + _check_plot_works(df.plot) + + @slow + def test_time(self): + import matplotlib.pyplot as plt + plt.close('all') + + t = datetime(1, 1, 1, 3, 30, 0) + deltas = np.random.randint(1, 20, 3).cumsum() + ts = np.array([(t + timedelta(minutes=int(x))).time() for x in deltas]) + df = DataFrame({'a' : np.random.randn(len(ts)), + 'b' : np.random.randn(len(ts))}, + index=ts) + ax = df.plot() + + # verify tick labels + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + xp = l.get_text() + if len(xp) > 0: + rs = time(h, m, s).strftime('%H:%M:%S') + self.assert_(xp, rs) + + # change xlim + ax.set_xlim('1:30', '5:00') + + # check tick labels again + ticks = ax.get_xticks() + labels = ax.get_xticklabels() + for t, l in zip(ticks, labels): + m, s = divmod(int(t), 60) + h, m = divmod(m, 60) + xp = l.get_text() + if len(xp) > 0: + rs = time(h, m, s).strftime('%H:%M:%S') + self.assert_(xp, rs) + +PNG_PATH = 'tmp.png' +def _check_plot_works(f, freq=None, series=None, *args, **kwargs): + import matplotlib.pyplot as plt + + fig = plt.gcf() + plt.clf() + ax = fig.add_subplot(211) + orig_ax = kwargs.pop('ax', plt.gca()) + orig_axfreq = getattr(orig_ax, 'freq', None) + + ret = f(*args, **kwargs) + assert(ret is not None) # do something more intelligent + + ax = kwargs.pop('ax', plt.gca()) + if series is not None: + dfreq = series.index.freq + if isinstance(dfreq, DateOffset): + dfreq = dfreq.rule_code + if orig_axfreq is None: + assert(ax.freq == dfreq) + + if freq is not None and orig_axfreq is None: + assert(ax.freq == freq) + + ax = fig.add_subplot(212) + try: + kwargs['ax'] = ax + ret = f(*args, 
**kwargs) + assert(ret is not None) # do something more intelligent + except Exception: + pass + plt.savefig(PNG_PATH) + os.remove(PNG_PATH) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tseries/tests/test_resample.py b/pandas/tseries/tests/test_resample.py new file mode 100644 index 00000000..15c82b8c --- /dev/null +++ b/pandas/tseries/tests/test_resample.py @@ -0,0 +1,792 @@ +from datetime import datetime, timedelta + +import numpy as np + +from pandas import Series, TimeSeries, DataFrame, Panel, isnull, notnull + +from pandas.tseries.index import date_range +from pandas.tseries.offsets import Minute, BDay +from pandas.tseries.period import period_range, PeriodIndex, Period +from pandas.tseries.resample import DatetimeIndex, TimeGrouper +import pandas.tseries.offsets as offsets +import pandas as pd + +import unittest +import nose + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +bday = BDay() + + +def _skip_if_no_pytz(): + try: + import pytz + except ImportError: + raise nose.SkipTest + + +class TestResample(unittest.TestCase): + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005,1,1), + end=datetime(2005,1,10), freq='Min') + + self.series = Series(np.random.rand(len(dti)), dti) + + def test_custom_grouper(self): + + dti = DatetimeIndex(freq='Min', start=datetime(2005,1,1), + end=datetime(2005,1,10)) + + data = np.array([1]*len(dti)) + s = Series(data, index=dti) + + b = TimeGrouper(Minute(5)) + g = s.groupby(b) + + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + b = TimeGrouper(Minute(5), closed='right', label='right') + g = s.groupby(b) + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'ohlc', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + + self.assertEquals(g.ngroups, 2593) + self.assert_(notnull(g.mean()).all()) + + # construct expected val + arr = [1] + [5] * 2592 + idx = dti[0:-1:5] + idx = idx.append(dti[-1:]) + expect = Series(arr, index=idx) + + # cython returns float for now + result = g.agg(np.sum) + assert_series_equal(result, expect.astype(float)) + + data = np.random.rand(len(dti), 10) + df = DataFrame(data, index=dti) + r = df.groupby(b).agg(np.sum) + + self.assertEquals(len(r.columns), 10) + self.assertEquals(len(r.index), 2593) + + def test_resample_basic(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + s = Series(np.random.randn(14), index=rng) + result = s.resample('5min', how='mean', closed='right', label='right') + expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=date_range('1/1/2000', periods=4, freq='5min')) + assert_series_equal(result, expected) + + result = s.resample('5min', how='mean', closed='left', label='right') + expected = Series([s[:5].mean(), s[5:10].mean(), s[10:].mean()], + index=date_range('1/1/2000 00:05', periods=3, + freq='5min')) + assert_series_equal(result, expected) + + s = self.series + result = s.resample('5Min', how='last') + grouper = TimeGrouper(Minute(5), closed='right', label='right') + expect = s.groupby(grouper).agg(lambda x: x[-1]) + assert_series_equal(result, expect) + + # from daily + dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10), + freq='D') + + s = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = s.resample('w-sun', 
how='last') + + self.assertEquals(len(result), 3) + self.assert_((result.index.dayofweek == [6,6,6]).all()) + self.assertEquals(result.irow(0), s['1/2/2005']) + self.assertEquals(result.irow(1), s['1/9/2005']) + self.assertEquals(result.irow(2), s.irow(-1)) + + result = s.resample('W-MON', how='last') + self.assertEquals(len(result), 2) + self.assert_((result.index.dayofweek == [0,0]).all()) + self.assertEquals(result.irow(0), s['1/3/2005']) + self.assertEquals(result.irow(1), s['1/10/2005']) + + result = s.resample('W-TUE', how='last') + self.assertEquals(len(result), 2) + self.assert_((result.index.dayofweek == [1,1]).all()) + self.assertEquals(result.irow(0), s['1/4/2005']) + self.assertEquals(result.irow(1), s['1/10/2005']) + + result = s.resample('W-WED', how='last') + self.assertEquals(len(result), 2) + self.assert_((result.index.dayofweek == [2,2]).all()) + self.assertEquals(result.irow(0), s['1/5/2005']) + self.assertEquals(result.irow(1), s['1/10/2005']) + + result = s.resample('W-THU', how='last') + self.assertEquals(len(result), 2) + self.assert_((result.index.dayofweek == [3,3]).all()) + self.assertEquals(result.irow(0), s['1/6/2005']) + self.assertEquals(result.irow(1), s['1/10/2005']) + + result = s.resample('W-FRI', how='last') + self.assertEquals(len(result), 2) + self.assert_((result.index.dayofweek == [4,4]).all()) + self.assertEquals(result.irow(0), s['1/7/2005']) + self.assertEquals(result.irow(1), s['1/10/2005']) + + # to biz day + result = s.resample('B', how='last') + self.assertEquals(len(result), 6) + self.assert_((result.index.dayofweek == [0,1,2,3,4,0]).all()) + self.assertEquals(result.irow(0), s['1/3/2005']) + self.assertEquals(result.irow(1), s['1/4/2005']) + self.assertEquals(result.irow(5), s['1/10/2005']) + + def test_resample_frame_basic(self): + df = tm.makeTimeDataFrame() + + b = TimeGrouper('M') + g = df.groupby(b) + + # check all cython functions work + funcs = ['add', 'mean', 'prod', 'min', 'max', 'var'] + for f in funcs: + g._cython_agg_general(f) + + result = df.resample('A') + assert_series_equal(result['A'], df['A'].resample('A')) + + result = df.resample('M') + assert_series_equal(result['A'], df['A'].resample('M')) + + df.resample('M', kind='period') + df.resample('W-WED', kind='period') + + def test_resample_loffset(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 00:13:00', freq='min') + s = Series(np.random.randn(14), index=rng) + + result = s.resample('5min', how='mean', closed='right', label='right', + loffset=timedelta(minutes=1)) + idx = date_range('1/1/2000', periods=4, freq='5min') + expected = Series([s[0], s[1:6].mean(), s[6:11].mean(), s[11:].mean()], + index=idx + timedelta(minutes=1)) + assert_series_equal(result, expected) + + expected = s.resample('5min', how='mean', closed='right', label='right', + loffset='1min') + assert_series_equal(result, expected) + + expected = s.resample('5min', how='mean', closed='right', label='right', + loffset=Minute(1)) + assert_series_equal(result, expected) + + self.assert_(result.index.freq == Minute(5)) + + # from daily + dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10), + freq='D') + ser = Series(np.random.rand(len(dti)), dti) + + # to weekly + result = ser.resample('w-sun', how='last') + expected = ser.resample('w-sun', how='last', loffset=-bday) + self.assertEqual(result.index[0] - bday, expected.index[0]) + + def test_resample_upsample(self): + # from daily + dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10), + freq='D') + + s = 
Series(np.random.rand(len(dti)), dti) + + # to minutely, by padding + result = s.resample('Min', fill_method='pad') + self.assertEquals(len(result), 12961) + self.assertEquals(result[0], s[0]) + self.assertEquals(result[-1], s[-1]) + + def test_upsample_with_limit(self): + rng = date_range('1/1/2000', periods=3, freq='5t') + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample('t', fill_method='ffill', limit=2) + expected = ts.reindex(result.index, method='ffill', limit=2) + assert_series_equal(result, expected) + + def test_resample_ohlc(self): + s = self.series + + grouper = TimeGrouper(Minute(5), closed='right', label='right') + expect = s.groupby(grouper).agg(lambda x: x[-1]) + result = s.resample('5Min', how='ohlc') + + self.assertEquals(len(result), len(expect)) + self.assertEquals(len(result.columns), 4) + + xs = result.irow(-1) + self.assertEquals(xs['open'], s[-5]) + self.assertEquals(xs['high'], s[-5:].max()) + self.assertEquals(xs['low'], s[-5:].min()) + self.assertEquals(xs['close'], s[-1]) + + xs = result.irow(1) + self.assertEquals(xs['open'], s[1]) + self.assertEquals(xs['high'], s[1:6].max()) + self.assertEquals(xs['low'], s[1:6].min()) + self.assertEquals(xs['close'], s[5]) + + def test_resample_reresample(self): + dti = DatetimeIndex(start=datetime(2005,1,1), end=datetime(2005,1,10), + freq='D') + s = Series(np.random.rand(len(dti)), dti) + bs = s.resample('B') + result = bs.resample('8H') + self.assertEquals(len(result), 22) + self.assert_(isinstance(result.index.freq, offsets.DateOffset)) + self.assert_(result.index.freq == offsets.Hour(8)) + + def test_resample_timestamp_to_period(self): + ts = _simple_ts('1/1/1990', '1/1/2000') + + result = ts.resample('A-DEC', kind='period') + expected = ts.resample('A-DEC') + expected.index = period_range('1990', '2000', freq='a-dec') + assert_series_equal(result, expected) + + result = ts.resample('A-JUN', kind='period') + expected = ts.resample('A-JUN') + expected.index = period_range('1990', '2000', freq='a-jun') + assert_series_equal(result, expected) + + result = ts.resample('M', kind='period') + expected = ts.resample('M') + expected.index = period_range('1990-01', '2000-01', freq='M') + assert_series_equal(result, expected) + + result = ts.resample('M', kind='period') + expected = ts.resample('M') + expected.index = period_range('1990-01', '2000-01', freq='M') + assert_series_equal(result, expected) + + def test_ohlc_5min(self): + def _ohlc(group): + if isnull(group).all(): + return np.repeat(np.nan, 4) + return [group[0], group.max(), group.min(), group[-1]] + + rng = date_range('1/1/2000 00:00:00', '1/1/2000 5:59:50', + freq='10s') + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('5min', how='ohlc') + + self.assert_((resampled.ix['1/1/2000 00:00'] == ts[0]).all()) + + exp = _ohlc(ts[1:31]) + self.assert_((resampled.ix['1/1/2000 00:05'] == exp).all()) + + exp = _ohlc(ts['1/1/2000 5:55:01':]) + self.assert_((resampled.ix['1/1/2000 6:00:00'] == exp).all()) + + def test_downsample_non_unique(self): + rng = date_range('1/1/2000', '2/29/2000') + rng2 = rng.repeat(5).values + ts = Series(np.random.randn(len(rng2)), index=rng2) + + result = ts.resample('M', how='mean') + + expected = ts.groupby(lambda x: x.month).mean() + self.assertEquals(len(result), 2) + assert_almost_equal(result[0], expected[1]) + assert_almost_equal(result[1], expected[2]) + + def test_asfreq_non_unique(self): + # GH #1077 + rng = date_range('1/1/2000', '2/29/2000') + rng2 = rng.repeat(2).values + ts = 
Series(np.random.randn(len(rng2)), index=rng2) + + self.assertRaises(Exception, ts.asfreq, 'B') + + def test_resample_axis1(self): + rng = date_range('1/1/2000', '2/29/2000') + df = DataFrame(np.random.randn(3, len(rng)), columns=rng, + index=['a', 'b', 'c']) + + result = df.resample('M', axis=1) + expected = df.T.resample('M').T + tm.assert_frame_equal(result, expected) + + def test_resample_panel(self): + rng = date_range('1/1/2000', '6/30/2000') + n = len(rng) + + panel = Panel(np.random.randn(3, n, 5), + items=['one', 'two', 'three'], + major_axis=rng, + minor_axis=['a', 'b', 'c', 'd', 'e']) + + result = panel.resample('M', axis=1) + + def p_apply(panel, f): + result = {} + for item in panel.items: + result[item] = f(panel[item]) + return Panel(result, items=panel.items) + + expected = p_apply(panel, lambda x: x.resample('M')) + tm.assert_panel_equal(result, expected) + + panel2 = panel.swapaxes(1, 2) + result = panel2.resample('M', axis=2) + expected = p_apply(panel2, lambda x: x.resample('M', axis=1)) + tm.assert_panel_equal(result, expected) + + def test_resample_panel_numpy(self): + rng = date_range('1/1/2000', '6/30/2000') + n = len(rng) + + panel = Panel(np.random.randn(3, n, 5), + items=['one', 'two', 'three'], + major_axis=rng, + minor_axis=['a', 'b', 'c', 'd', 'e']) + + result = panel.resample('M', how=lambda x: x.mean(), axis=1) + expected = panel.resample('M', how='mean', axis=1) + tm.assert_panel_equal(result, expected) + + def test_resample_anchored_ticks(self): + # If a fixed delta (5 minute, 4 hour) evenly divides a day, we should + # "anchor" the origin at midnight so we get regular intervals rather + # than starting from the first timestamp which might start in the middle + # of a desired interval + + rng = date_range('1/1/2000 04:00:00', periods=86400, freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + freqs = ['t', '5t', '15t', '30t', '4h', '12h'] + for freq in freqs: + result = ts[2:].resample(freq, closed='left', label='left') + expected = ts.resample(freq, closed='left', label='left') + assert_series_equal(result, expected) + + def test_resample_base(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 02:00', freq='s') + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('5min', base=2) + exp_rng = date_range('1/1/2000 00:02:00', '1/1/2000 02:02', + freq='5min') + self.assert_(resampled.index.equals(exp_rng)) + + def test_resample_daily_anchored(self): + rng = date_range('1/1/2000 0:00:00', periods=10000, freq='T') + ts = Series(np.random.randn(len(rng)), index=rng) + ts[:2] = np.nan # so results are the same + + result = ts[2:].resample('D', closed='left', label='left') + expected = ts.resample('D', closed='left', label='left') + assert_series_equal(result, expected) + + def test_resample_to_period_monthly_buglet(self): + # GH #1259 + + rng = date_range('1/1/2000','12/31/2000') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('M', kind='period') + exp_index = period_range('Jan-2000', 'Dec-2000', freq='M') + self.assert_(result.index.equals(exp_index)) + + def test_resample_empty(self): + ts = _simple_ts('1/1/2000', '2/1/2000')[:0] + + result = ts.resample('A') + self.assert_(len(result) == 0) + self.assert_(result.index.freqstr == 'A-DEC') + + result = ts.resample('A', kind='period') + self.assert_(len(result) == 0) + self.assert_(result.index.freqstr == 'A-DEC') + + def test_weekly_resample_buglet(self): + # #1327 + rng = date_range('1/1/2000', 
freq='B', periods=20) + ts = Series(np.random.randn(len(rng)), index=rng) + + resampled = ts.resample('W') + expected = ts.resample('W-SUN') + assert_series_equal(resampled, expected) + + def test_monthly_resample_error(self): + # #1451 + dates = date_range('4/16/2012 20:00', periods=5000, freq='h') + ts = Series(np.random.randn(len(dates)), index=dates) + # it works! + result = ts.resample('M') + + def test_resample_anchored_intraday(self): + # #1471, #1458 + + rng = date_range('1/1/2012', '4/1/2012', freq='10min') + df = DataFrame(rng.month, index=rng) + + result = df.resample('M') + expected = df.resample('M', kind='period').to_timestamp() + tm.assert_frame_equal(result, expected) + + result = df.resample('M', closed='left') + expected = df.resample('M', kind='period', closed='left').to_timestamp() + tm.assert_frame_equal(result, expected) + + rng = date_range('1/1/2012', '4/1/2013', freq='10min') + df = DataFrame(rng.month, index=rng) + + result = df.resample('Q') + expected = df.resample('Q', kind='period').to_timestamp() + tm.assert_frame_equal(result, expected) + + result = df.resample('Q', closed='left') + expected = df.resample('Q', kind='period', closed='left').to_timestamp() + tm.assert_frame_equal(result, expected) + + ts = _simple_ts('2012-04-29 23:00', '2012-04-30 5:00', freq='h') + resampled = ts.resample('M') + self.assert_(len(resampled) == 1) + + def test_resample_anchored_monthstart(self): + ts = _simple_ts('1/1/2000', '12/31/2002') + + freqs = ['MS', 'BMS', 'QS-MAR', 'AS-DEC', 'AS-JUN'] + + for freq in freqs: + result = ts.resample(freq, how='mean') + + def test_corner_cases(self): + # miscellaneous test coverage + + rng = date_range('1/1/2000', periods=12, freq='t') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5t', closed='right', label='left') + ex_index = date_range('1999-12-31 23:55', periods=4, freq='5t') + self.assert_(result.index.equals(ex_index)) + + len0pts = _simple_pts('2007-01', '2010-05', freq='M')[:0] + # it works + result = len0pts.resample('A-DEC') + self.assert_(len(result) == 0) + + # resample to periods + ts = _simple_ts('2000-04-28', '2000-04-30 11:00', freq='h') + result = ts.resample('M', kind='period') + self.assert_(len(result) == 1) + self.assert_(result.index[0] == Period('2000-04', freq='M')) + + +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + +def _simple_pts(start, end, freq='D'): + rng = period_range(start, end, freq=freq) + return TimeSeries(np.random.randn(len(rng)), index=rng) + + +from pandas.tseries.frequencies import MONTHS, DAYS +from pandas.util.compat import product + + +class TestResamplePeriodIndex(unittest.TestCase): + + def test_basic_downsample(self): + ts = _simple_pts('1/1/1990', '6/30/1995', freq='M') + result = ts.resample('a-dec') + + expected = ts.groupby(ts.index.year).mean() + expected.index = period_range('1/1/1990', '6/30/1995', + freq='a-dec') + assert_series_equal(result, expected) + + # this is ok + assert_series_equal(ts.resample('a-dec'), result) + assert_series_equal(ts.resample('a'), result) + + def test_not_subperiod(self): + # These are incompatible period rules for resampling + ts = _simple_pts('1/1/1990', '6/30/1995', freq='w-wed') + self.assertRaises(ValueError, ts.resample, 'a-dec') + self.assertRaises(ValueError, ts.resample, 'q-mar') + self.assertRaises(ValueError, ts.resample, 'M') + self.assertRaises(ValueError, ts.resample, 'w-thu') + + def test_basic_upsample(self): + ts = 
_simple_pts('1/1/1990', '6/30/1995', freq='M') + result = ts.resample('a-dec') + + resampled = result.resample('D', fill_method='ffill', convention='end') + + expected = result.to_timestamp('D', how='end') + expected = expected.asfreq('D', 'ffill').to_period() + + assert_series_equal(resampled, expected) + + def test_upsample_with_limit(self): + rng = period_range('1/1/2000', periods=5, freq='A') + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample('M', fill_method='ffill', limit=2) + expected = ts.asfreq('M').reindex(result.index, method='ffill', + limit=2) + assert_series_equal(result, expected) + + def test_annual_upsample(self): + targets = ['D', 'B', 'M'] + + for month in MONTHS: + ts = _simple_pts('1/1/1990', '12/31/1995', freq='A-%s' % month) + + for targ, conv, meth in product(targets, ['start', 'end'], + ['ffill', 'bfill']): + result = ts.resample(targ, fill_method=meth, + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, meth).to_period() + assert_series_equal(result, expected) + + df = DataFrame({'a' : ts}) + rdf = df.resample('D', fill_method='ffill') + exp = df['a'].resample('D', fill_method='ffill') + assert_series_equal(rdf['a'], exp) + + def test_quarterly_upsample(self): + targets = ['D', 'B', 'M'] + + for month in MONTHS: + ts = _simple_pts('1/1/1990', '12/31/1995', freq='Q-%s' % month) + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_monthly_upsample(self): + targets = ['D', 'B'] + + ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_weekly_upsample(self): + targets = ['D', 'B'] + + for day in DAYS: + ts = _simple_pts('1/1/1990', '12/31/1995', freq='W-%s' % day) + + for targ, conv in product(targets, ['start', 'end']): + result = ts.resample(targ, fill_method='ffill', + convention=conv) + expected = result.to_timestamp(targ, how=conv) + expected = expected.asfreq(targ, 'ffill').to_period() + assert_series_equal(result, expected) + + def test_resample_to_timestamps(self): + ts = _simple_pts('1/1/1990', '12/31/1995', freq='M') + + result = ts.resample('A-DEC', kind='timestamp') + expected = ts.to_timestamp(how='end').resample('A-DEC') + assert_series_equal(result, expected) + + def test_resample_to_quarterly(self): + for month in MONTHS: + ts = _simple_pts('1990', '1992', freq='A-%s' % month) + quar_ts = ts.resample('Q-%s' % month, fill_method='ffill') + + stamps = ts.to_timestamp('D', how='end') + qdates = period_range(stamps.index[0], stamps.index[-1], + freq='Q-%s' % month) + + expected = stamps.reindex(qdates.to_timestamp('D', 'e'), + method='ffill') + expected.index = qdates + + assert_series_equal(quar_ts, expected) + + # conforms, but different month + ts = _simple_pts('1990', '1992', freq='A-JUN') + + for how in ['start', 'end']: + result = ts.resample('Q-MAR', convention=how, fill_method='ffill') + expected = ts.asfreq('Q-MAR', how=how).to_timestamp('D') + expected = expected.resample('Q-MAR', fill_method='ffill') + assert_series_equal(result, expected.to_period('Q-MAR')) + + def 
test_resample_fill_missing(self): + rng = PeriodIndex([2000, 2005, 2007, 2009], freq='A') + + s = TimeSeries(np.random.randn(4), index=rng) + + stamps = s.to_timestamp() + + filled = s.resample('A') + expected = stamps.resample('A').to_period('A') + assert_series_equal(filled, expected) + + filled = s.resample('A', fill_method='ffill') + expected = stamps.resample('A', fill_method='ffill').to_period('A') + assert_series_equal(filled, expected) + + def test_cant_fill_missing_dups(self): + rng = PeriodIndex([2000, 2005, 2005, 2007, 2007], freq='A') + s = TimeSeries(np.random.randn(5), index=rng) + self.assertRaises(Exception, s.resample, 'A') + + def test_resample_5minute(self): + rng = period_range('1/1/2000', '1/5/2000', freq='T') + ts = TimeSeries(np.random.randn(len(rng)), index=rng) + + result = ts.resample('5min') + expected = ts.to_timestamp().resample('5min') + assert_series_equal(result, expected) + + def test_upsample_daily_business_daily(self): + ts = _simple_pts('1/1/2000', '2/1/2000', freq='B') + + result = ts.resample('D') + expected = ts.asfreq('D').reindex(period_range('1/3/2000', '2/1/2000')) + assert_series_equal(result, expected) + + ts = _simple_pts('1/1/2000', '2/1/2000') + result = ts.resample('H', convention='s') + exp_rng = period_range('1/1/2000', '2/1/2000', freq='H') + expected = ts.asfreq('H', how='s').reindex(exp_rng) + assert_series_equal(result, expected) + + def test_resample_empty(self): + ts = _simple_pts('1/1/2000', '2/1/2000')[:0] + + result = ts.resample('A') + self.assert_(len(result) == 0) + + def test_resample_irregular_sparse(self): + dr = date_range(start='1/1/2012', freq='5min', periods=1000) + s = Series(np.array(100), index=dr) + # subset the data. + subset = s[:'2012-01-04 07:00'] + + result = subset.resample('10min', how=len) + expected = s.resample('10min', how=len).ix[result.index] + assert_series_equal(result, expected) + + def test_resample_weekly_all_na(self): + rng = date_range('1/1/2000', periods=10, freq='W-WED') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.resample('W-THU') + + self.assert_(result.isnull().all()) + + result = ts.resample('W-THU', fill_method='ffill')[:-1] + expected = ts.asfreq('W-THU', method='ffill') + assert_series_equal(result, expected) + + def test_resample_tz_localized(self): + dr = date_range(start='2012-4-13', end='2012-5-1') + ts = Series(range(len(dr)), dr) + + ts_utc = ts.tz_localize('UTC') + ts_local = ts_utc.tz_convert('America/Los_Angeles') + + result = ts_local.resample('W') + + ts_local_naive = ts_local.copy() + ts_local_naive.index = [x.replace(tzinfo=None) + for x in ts_local_naive.index.to_pydatetime()] + + exp = ts_local_naive.resample('W').tz_localize('America/Los_Angeles') + + assert_series_equal(result, exp) + + def test_closed_left_corner(self): + # #1465 + s = Series(np.random.randn(21), + index=date_range(start='1/1/2012 9:30', + freq='1min', periods=21)) + s[0] = np.nan + + result = s.resample('10min', how='mean',closed='left', label='right') + exp = s[1:].resample('10min', how='mean',closed='left', label='right') + assert_series_equal(result, exp) + + result = s.resample('10min', how='mean',closed='left', label='left') + exp = s[1:].resample('10min', how='mean',closed='left', label='left') + + ex_index = date_range(start='1/1/2012 9:30', freq='10min', periods=3) + + self.assert_(result.index.equals(ex_index)) + assert_series_equal(result, exp) + + +class TestTimeGrouper(unittest.TestCase): + + def setUp(self): + self.ts = Series(np.random.randn(1000), + 
index=date_range('1/1/2000', periods=1000)) + + def test_apply(self): + grouper = TimeGrouper('A', label='right', closed='right') + + grouped = self.ts.groupby(grouper) + + f = lambda x: x.order()[-3:] + + applied = grouped.apply(f) + expected = self.ts.groupby(lambda x: x.year).apply(f) + + applied.index = applied.index.droplevel(0) + expected.index = expected.index.droplevel(0) + assert_series_equal(applied, expected) + + def test_count(self): + self.ts[::3] = np.nan + + grouper = TimeGrouper('A', label='right', closed='right') + result = self.ts.resample('A', how='count') + + expected = self.ts.groupby(lambda x: x.year).count() + expected.index = result.index + + assert_series_equal(result, expected) + + def test_numpy_reduction(self): + result = self.ts.resample('A', how='prod', closed='right') + + expected = self.ts.groupby(lambda x: x.year).agg(np.prod) + expected.index = result.index + + assert_series_equal(result, expected) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py new file mode 100644 index 00000000..5f6a00a2 --- /dev/null +++ b/pandas/tseries/tests/test_timeseries.py @@ -0,0 +1,1882 @@ +# pylint: disable-msg=E1101,W0612 +from __future__ import with_statement # for Python 2.5 +import pandas.util.compat as itertools +from datetime import datetime, time, timedelta +import sys +import os +import unittest + +import nose + +import numpy as np +randn = np.random.randn + +from pandas import (Index, Series, TimeSeries, DataFrame, isnull, + date_range, Timestamp, DatetimeIndex, Int64Index, + to_datetime, bdate_range) + +from pandas.core.daterange import DateRange +import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets +import pandas.tseries.frequencies as fmod + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +from pandas.lib import NaT, iNaT +import pandas.lib as lib +import cPickle as pickle +import pandas.core.datetools as dt +from numpy.random import rand +from pandas.util.testing import assert_frame_equal +import pandas.util.py3compat as py3compat +from pandas.core.datetools import BDay +import pandas.core.common as com + + +class TestTimeSeriesDuplicates(unittest.TestCase): + + def setUp(self): + dates = [datetime(2000, 1, 2), datetime(2000, 1, 2), + datetime(2000, 1, 2), datetime(2000, 1, 3), + datetime(2000, 1, 3), datetime(2000, 1, 3), + datetime(2000, 1, 4), datetime(2000, 1, 4), + datetime(2000, 1, 4), datetime(2000, 1, 5)] + + self.dups = Series(np.random.randn(len(dates)), index=dates) + + def test_constructor(self): + self.assert_(isinstance(self.dups, TimeSeries)) + self.assert_(isinstance(self.dups.index, DatetimeIndex)) + + def test_is_unique_monotonic(self): + self.assert_(not self.dups.index.is_unique) + + def test_index_unique(self): + uniques = self.dups.index.unique() + self.assert_(uniques.dtype == 'M8[ns]') # sanity + + def test_duplicate_dates_indexing(self): + ts = self.dups + + uniques = ts.index.unique() + for date in uniques: + result = ts[date] + + mask = ts.index == date + total = (ts.index == date).sum() + expected = ts[mask] + if total > 1: + assert_series_equal(result, expected) + else: + assert_almost_equal(result, expected[0]) + + cp = ts.copy() + cp[date] = 0 + expected = Series(np.where(mask, 0, ts), index=ts.index) + assert_series_equal(cp, expected) + + self.assertRaises(KeyError, 
ts.__getitem__, datetime(2000, 1, 6)) + self.assertRaises(KeyError, ts.__setitem__, datetime(2000, 1, 6), 0) + + def test_range_slice(self): + idx = DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/3/2000', + '1/4/2000']) + + ts = Series(np.random.randn(len(idx)), index=idx) + + result = ts['1/2/2000':] + expected = ts[1:] + assert_series_equal(result, expected) + + result = ts['1/2/2000':'1/3/2000'] + expected = ts[1:4] + assert_series_equal(result, expected) + + def test_groupby_average_dup_values(self): + result = self.dups.groupby(level=0).mean() + expected = self.dups.groupby(self.dups.index).mean() + assert_series_equal(result, expected) + + +def assert_range_equal(left, right): + assert(left.equals(right)) + assert(left.freq == right.freq) + assert(left.tz == right.tz) + + +def _skip_if_no_pytz(): + try: + import pytz + except ImportError: + raise nose.SkipTest + + +class TestTimeSeries(unittest.TestCase): + + def test_dti_slicing(self): + dti = DatetimeIndex(start='1/1/2005', end='12/1/2005', freq='M') + dti2 = dti[[1,3,5]] + + v1 = dti2[0] + v2 = dti2[1] + v3 = dti2[2] + + self.assertEquals(v1, Timestamp('2/28/2005')) + self.assertEquals(v2, Timestamp('4/30/2005')) + self.assertEquals(v3, Timestamp('6/30/2005')) + + # don't carry freq through irregular slicing + self.assert_(dti2.freq is None) + + def test_pass_datetimeindex_to_index(self): + # Bugs in #1396 + + rng = date_range('1/1/2000', '3/1/2000') + idx = Index(rng, dtype=object) + + expected = Index(rng.to_pydatetime(), dtype=object) + + self.assert_(np.array_equal(idx.values, expected.values)) + + def test_contiguous_boolean_preserve_freq(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + mask = np.zeros(len(rng), dtype=bool) + mask[10:20] = True + + masked = rng[mask] + expected = rng[10:20] + self.assert_(expected.freq is not None) + assert_range_equal(masked, expected) + + mask[22] = True + masked = rng[mask] + self.assert_(masked.freq is None) + + def test_getitem_median_slice_bug(self): + index = date_range('20090415', '20090519', freq='2B') + s = Series(np.random.randn(13), index=index) + + indexer = [slice(6, 7, None)] + result = s[indexer] + expected = s[indexer[0]] + assert_series_equal(result, expected) + + def test_series_box_timestamp(self): + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng) + + self.assert_(isinstance(s[5], Timestamp)) + + rng = date_range('20090415', '20090519', freq='B') + s = Series(rng, index=rng) + self.assert_(isinstance(s[5], Timestamp)) + + def test_timestamp_to_datetime(self): + _skip_if_no_pytz() + rng = date_range('20090415', '20090519', + tz='US/Eastern') + + stamp = rng[0] + dtval = stamp.to_pydatetime() + self.assertEquals(stamp, dtval) + self.assertEquals(stamp.tzinfo, dtval.tzinfo) + + def test_index_convert_to_datetime_array(self): + _skip_if_no_pytz() + + def _check_rng(rng): + converted = rng.to_pydatetime() + self.assert_(isinstance(converted, np.ndarray)) + for x, stamp in zip(converted, rng): + self.assert_(type(x) is datetime) + self.assertEquals(x, stamp.to_pydatetime()) + self.assertEquals(x.tzinfo, stamp.tzinfo) + + rng = date_range('20090415', '20090519') + rng_eastern = date_range('20090415', '20090519', tz='US/Eastern') + rng_utc = date_range('20090415', '20090519', tz='utc') + + _check_rng(rng) + _check_rng(rng_eastern) + _check_rng(rng_utc) + + def test_series_ctor_plus_datetimeindex(self): + rng = date_range('20090415', '20090519', freq='B') + data = dict((k, 1) for k in rng) + + result = Series(data, index=rng) + 
self.assert_(result.index is rng) + + def test_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index, method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + + def test_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + result = s[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = s[:2].reindex(index).fillna(method='pad') + expected[-3:] = np.nan + assert_series_equal(result, expected) + + result = s[-2:].reindex(index) + result = result.fillna(method='bfill', limit=5) + + expected = s[-2:].reindex(index).fillna(method='backfill') + expected[:3] = np.nan + assert_series_equal(result, expected) + + def test_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index, method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index, method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + + result = df[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = df[:2].reindex(index).fillna(method='pad') + expected.values[-3:] = np.nan + tm.assert_frame_equal(result, expected) + + result = df[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = df[-2:].reindex(index).fillna(method='backfill') + expected.values[:3] = np.nan + tm.assert_frame_equal(result, expected) + + def test_sparse_series_fillna_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + + ss = s[:2].reindex(index).to_sparse() + result = ss.fillna(method='pad', limit=5) + expected = ss.fillna(method='pad', limit=5) + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + ss = s[-2:].reindex(index).to_sparse() + result = ss.fillna(method='backfill', limit=5) + expected = ss.fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_sparse_series_pad_backfill_limit(self): + index = np.arange(10) + s = Series(np.random.randn(10), index=index) + s = s.to_sparse() + + result = s[:2].reindex(index, method='pad', limit=5) + expected = s[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected[-3:] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + result = s[-2:].reindex(index, method='backfill', limit=5) + expected = s[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected[:3] = np.nan + expected = expected.to_sparse() + assert_series_equal(result, expected) + + def test_sparse_frame_pad_backfill_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = 
df.to_sparse() + + result = sdf[:2].reindex(index, method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index, method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_sparse_frame_fillna_limit(self): + index = np.arange(10) + df = DataFrame(np.random.randn(10, 4), index=index) + sdf = df.to_sparse() + + result = sdf[:2].reindex(index) + result = result.fillna(method='pad', limit=5) + + expected = sdf[:2].reindex(index).fillna(method='pad') + expected = expected.to_dense() + expected.values[-3:] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + result = sdf[-2:].reindex(index) + result = result.fillna(method='backfill', limit=5) + + expected = sdf[-2:].reindex(index).fillna(method='backfill') + expected = expected.to_dense() + expected.values[:3] = np.nan + expected = expected.to_sparse() + tm.assert_frame_equal(result, expected) + + def test_pad_require_monotonicity(self): + rng = date_range('1/1/2000', '3/1/2000', freq='B') + + rng2 = rng[::2][::-1] + + self.assertRaises(AssertionError, rng2.get_indexer, rng, + method='pad') + + def test_frame_ctor_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + dates = np.asarray(rng) + + df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) + self.assert_(np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))) + + def test_frame_add_datetime64_column(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + df = DataFrame(index=np.arange(len(rng))) + + df['A'] = rng + self.assert_(np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))) + + def test_frame_datetime64_pre1900_repr(self): + df = DataFrame({'year': date_range('1/1/1700', periods=50, + freq='A-DEC')}) + # it works! 
+ repr(df) + + def test_frame_add_datetime64_col_other_units(self): + n = 100 + + units = ['h', 'm', 's', 'ms', 'D', 'M', 'Y'] + + ns_dtype = np.dtype('M8[ns]') + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + df = DataFrame({'ints' : np.arange(n)}, index=np.arange(n)) + df[unit] = vals + + ex_vals = to_datetime(vals.astype('O')) + + self.assert_(df[unit].dtype == ns_dtype) + self.assert_((df[unit].values == ex_vals).all()) + + # Test insertion into existing datetime64 column + df = DataFrame({'ints' : np.arange(n)}, index=np.arange(n)) + df['dates'] = np.arange(n, dtype=np.int64).view(ns_dtype) + + for unit in units: + dtype = np.dtype('M8[%s]' % unit) + vals = np.arange(n, dtype=np.int64).view(dtype) + + tmp = df.copy() + + tmp['dates'] = vals + ex_vals = to_datetime(vals.astype('O')) + + self.assert_((tmp['dates'].values == ex_vals).all()) + + def test_series_ctor_datetime64(self): + rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', + freq='10s') + dates = np.asarray(rng) + + series = Series(dates) + self.assert_(np.issubdtype(series.dtype, np.dtype('M8[ns]'))) + + def test_index_cast_datetime64_other_units(self): + arr = np.arange(0, 100, 10, dtype=np.int64).view('M8[D]') + + idx = Index(arr) + + self.assert_((idx.values == lib.cast_to_nanoseconds(arr)).all()) + + def test_index_astype_datetime64(self): + idx = Index([datetime(2012, 1, 1)], dtype=object) + + if np.__version__ >= '1.7': + raise nose.SkipTest + + casted = idx.astype(np.dtype('M8[D]')) + expected = DatetimeIndex(idx.values) + self.assert_(isinstance(casted, DatetimeIndex)) + self.assert_(casted.equals(expected)) + + def test_reindex_series_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + series = Series(rng) + + result = series.reindex(range(15)) + self.assert_(np.issubdtype(result.dtype, np.dtype('M8[ns]'))) + + mask = result.isnull() + self.assert_(mask[-5:].all()) + self.assert_(not mask[:-5].any()) + + def test_reindex_frame_add_nat(self): + rng = date_range('1/1/2000 00:00:00', periods=10, freq='10s') + df = DataFrame({'A': np.random.randn(len(rng)), 'B': rng}) + + result = df.reindex(range(15)) + self.assert_(np.issubdtype(result['B'].dtype, np.dtype('M8[ns]'))) + + mask = com.isnull(result)['B'] + self.assert_(mask[-5:].all()) + self.assert_(not mask[:-5].any()) + + def test_series_repr_nat(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + result = repr(series) + expected = ('0 1970-01-01 00:00:00\n' + '1 1970-01-01 00:00:00.000001\n' + '2 1970-01-01 00:00:00.000002\n' + '3 NaT') + self.assertEquals(result, expected) + + def test_fillna_nat(self): + series = Series([0, 1, 2, iNaT], dtype='M8[ns]') + + filled = series.fillna(method='pad') + filled2 = series.fillna(value=series.values[2]) + + expected = series.copy() + expected.values[3] = expected.values[2] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = df.fillna(method='pad') + filled2 = df.fillna(value=series.values[2]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + + series = Series([iNaT, 0, 1, 2], dtype='M8[ns]') + + filled = series.fillna(method='bfill') + filled2 = series.fillna(value=series[1]) + + expected = series.copy() + expected[0] = expected[1] + + assert_series_equal(filled, expected) + assert_series_equal(filled2, expected) + + df = DataFrame({'A': series}) + filled = 
df.fillna(method='bfill') + filled2 = df.fillna(value=series[1]) + expected = DataFrame({'A': expected}) + assert_frame_equal(filled, expected) + assert_frame_equal(filled2, expected) + + def test_string_na_nat_conversion(self): + # GH #999, #858 + + from dateutil.parser import parse + + strings = np.array(['1/1/2000', '1/2/2000', np.nan, + '1/4/2000, 12:34:56'], dtype=object) + + expected = np.empty(4, dtype='M8[ns]') + for i, val in enumerate(strings): + if com.isnull(val): + expected[i] = iNaT + else: + expected[i] = parse(val) + + result = lib.array_to_datetime(strings) + assert_almost_equal(result, expected) + + result2 = to_datetime(strings) + self.assert_(isinstance(result2, DatetimeIndex)) + assert_almost_equal(result, result2) + + malformed = np.array(['1/100/2000', np.nan], dtype=object) + result = to_datetime(malformed) + assert_almost_equal(result, malformed) + + self.assertRaises(ValueError, to_datetime, malformed, + errors='raise') + + idx = ['a', 'b', 'c', 'd', 'e'] + series = Series(['1/1/2000', np.nan, '1/3/2000', np.nan, + '1/5/2000'], index=idx, name='foo') + dseries = Series([to_datetime('1/1/2000'), np.nan, + to_datetime('1/3/2000'), np.nan, + to_datetime('1/5/2000')], index=idx, name='foo') + + result = to_datetime(series) + dresult = to_datetime(dseries) + + expected = Series(np.empty(5, dtype='M8[ns]'), index=idx) + for i in range(5): + x = series[i] + if isnull(x): + expected[i] = iNaT + else: + expected[i] = to_datetime(x) + + assert_series_equal(result, expected) + self.assertEquals(result.name, 'foo') + + assert_series_equal(dresult, expected) + self.assertEquals(dresult.name, 'foo') + + def test_nat_vector_field_access(self): + idx = DatetimeIndex(['1/1/2000', None, None, '1/4/2000']) + + fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'microsecond', 'nanosecond', + 'week', 'dayofyear'] + for field in fields: + result = getattr(idx, field) + expected = [getattr(x, field) if x is not NaT else -1 + for x in idx] + self.assert_(np.array_equal(result, expected)) + + def test_nat_scalar_field_access(self): + fields = ['year', 'quarter', 'month', 'day', 'hour', + 'minute', 'second', 'microsecond', 'nanosecond', + 'week', 'dayofyear'] + for field in fields: + result = getattr(NaT, field) + self.assertEquals(result, -1) + + self.assertEquals(NaT.weekday(), -1) + + def test_to_datetime_empty_string(self): + result = to_datetime('') + self.assert_(result == '') + + result = to_datetime(['', '']) + self.assert_(isnull(result).all()) + + def test_to_datetime_other_datetime64_units(self): + # 5/25/2012 + scalar = np.int64(1337904000000000).view('M8[us]') + as_obj = scalar.astype('O') + + index = DatetimeIndex([scalar]) + self.assertEquals(index[0], scalar.astype('O')) + + value = Timestamp(scalar) + self.assertEquals(value, as_obj) + + def test_to_datetime_list_of_integers(self): + rng = date_range('1/1/2000', periods=20) + rng = DatetimeIndex(rng.values) + + ints = list(rng.asi8) + + result = DatetimeIndex(ints) + + self.assert_(rng.equals(result)) + + def test_index_to_datetime(self): + idx = Index(['1/1/2000', '1/2/2000', '1/3/2000']) + + result = idx.to_datetime() + expected = DatetimeIndex(datetools.to_datetime(idx.values)) + self.assert_(result.equals(expected)) + + today = datetime.today() + idx = Index([today], dtype=object) + result = idx.to_datetime() + expected = DatetimeIndex([today]) + self.assert_(result.equals(expected)) + + def test_to_datetime_freq(self): + xp = bdate_range('2000-1-1', periods=10, tz='UTC') + rs = xp.to_datetime() 
+ self.assert_(xp.freq == rs.freq) + self.assert_(xp.tzinfo == rs.tzinfo) + + def test_range_misspecified(self): + # GH #1095 + + self.assertRaises(ValueError, date_range, '1/1/2000') + self.assertRaises(ValueError, date_range, end='1/1/2000') + self.assertRaises(ValueError, date_range, periods=10) + + self.assertRaises(ValueError, date_range, '1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, end='1/1/2000', freq='H') + self.assertRaises(ValueError, date_range, periods=10, freq='H') + + def test_reasonable_keyerror(self): + # GH #1062 + index = DatetimeIndex(['1/3/2000']) + try: + index.get_loc('1/1/2000') + except KeyError, e: + self.assert_('2000' in str(e)) + + def test_reindex_with_datetimes(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + result = ts.reindex(list(ts.index[5:10])) + expected = ts[5:10] + tm.assert_series_equal(result, expected) + + result = ts[list(ts.index[5:10])] + tm.assert_series_equal(result, expected) + + def test_promote_datetime_date(self): + rng = date_range('1/1/2000', periods=20) + ts = Series(np.random.randn(20), index=rng) + + ts_slice = ts[5:] + ts2 = ts_slice.copy() + ts2.index = [x.date() for x in ts2.index] + + result = ts + ts2 + result2 = ts2 + ts + expected = ts + ts[5:] + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + # test asfreq + result = ts2.asfreq('4H', method='ffill') + expected = ts[5:].asfreq('4H', method='ffill') + assert_series_equal(result, expected) + + result = rng.get_indexer(ts2.index) + expected = rng.get_indexer(ts_slice.index) + self.assert_(np.array_equal(result, expected)) + + def test_date_range_gen_error(self): + rng = date_range('1/1/2000 00:00', '1/1/2000 00:18', freq='5min') + self.assertEquals(len(rng), 4) + + def test_first_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.first('10d') + self.assert_(len(result) == 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.first('10d') + self.assert_(len(result) == 10) + + result = ts.first('3M') + expected = ts[:'3/31/2000'] + assert_series_equal(result, expected) + + result = ts.first('21D') + expected = ts[:21] + assert_series_equal(result, expected) + + result = ts[:0].first('3M') + assert_series_equal(result, ts[:0]) + + def test_last_subset(self): + ts = _simple_ts('1/1/2000', '1/1/2010', freq='12h') + result = ts.last('10d') + self.assert_(len(result) == 20) + + ts = _simple_ts('1/1/2000', '1/1/2010') + result = ts.last('10d') + self.assert_(len(result) == 10) + + result = ts.last('21D') + expected = ts['12/12/2009':] + assert_series_equal(result, expected) + + result = ts.last('21D') + expected = ts[-21:] + assert_series_equal(result, expected) + + result = ts[:0].last('3M') + assert_series_equal(result, ts[:0]) + + def test_add_offset(self): + rng = date_range('1/1/2000', '2/1/2000') + + result = rng + offsets.Hour(2) + expected = date_range('1/1/2000 02:00', '2/1/2000 02:00') + self.assert_(result.equals(expected)) + + def test_format_pre_1900_dates(self): + rng = date_range('1/1/1850', '1/1/1950', freq='A-DEC') + rng.format() + ts = Series(1, index=rng) + repr(ts) + + def test_repeat(self): + rng = date_range('1/1/2000', '1/1/2001') + + result = rng.repeat(5) + self.assert_(result.freq is None) + self.assert_(len(result) == 5 * len(rng)) + + def test_at_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + rs = ts.at_time(rng[1]) + 
self.assert_((rs.index.hour == rng[1].hour).all()) + self.assert_((rs.index.minute == rng[1].minute).all()) + self.assert_((rs.index.second == rng[1].second).all()) + + result = ts.at_time('9:30') + expected = ts.at_time(time(9, 30)) + assert_series_equal(result, expected) + + df = DataFrame(np.random.randn(len(rng), 3), index=rng) + + result = ts[time(9, 30)] + result_df = df.ix[time(9, 30)] + expected = ts[(rng.hour == 9) & (rng.minute == 30)] + exp_df = df[(rng.hour == 9) & (rng.minute == 30)] + + # expected.index = date_range('1/1/2000', '1/4/2000') + + assert_series_equal(result, expected) + tm.assert_frame_equal(result_df, exp_df) + + chunk = df.ix['1/4/2000':] + result = chunk.ix[time(9, 30)] + expected = result_df[-1:] + tm.assert_frame_equal(result, expected) + + # midnight, everything + rng = date_range('1/1/2000', '1/31/2000') + ts = Series(np.random.randn(len(rng)), index=rng) + + result = ts.at_time(time(0, 0)) + assert_series_equal(result, ts) + + # time doesn't exist + rng = date_range('1/1/2012', freq='23Min', periods=384) + ts = Series(np.random.randn(len(rng)), rng) + rs = ts.at_time('16:00') + self.assert_(len(rs) == 0) + + def test_between_time(self): + rng = date_range('1/1/2000', '1/5/2000', freq='5min') + ts = Series(np.random.randn(len(rng)), index=rng) + stime = time(0, 0) + etime = time(1, 0) + + close_open = itertools.product([True, False], [True, False]) + for inc_start, inc_end in close_open: + filtered = ts.between_time(stime, etime, inc_start, inc_end) + exp_len = 13 * 4 + 1 + if not inc_start: + exp_len -= 5 + if not inc_end: + exp_len -= 4 + + self.assert_(len(filtered) == exp_len) + for rs in filtered.index: + t = rs.time() + if inc_start: + self.assert_(t >= stime) + else: + self.assert_(t > stime) + + if inc_end: + self.assert_(t <= etime) + else: + self.assert_(t < etime) + + result = ts.between_time('00:00', '01:00') + expected = ts.between_time(stime, etime) + assert_series_equal(result, expected) + + def test_dti_constructor_preserve_dti_freq(self): + rng = date_range('1/1/2000', '1/2/2000', freq='5min') + + rng2 = DatetimeIndex(rng) + self.assert_(rng.freq == rng2.freq) + + def test_normalize(self): + rng = date_range('1/1/2000 9:30', periods=10, freq='D') + + result = rng.normalize() + expected = date_range('1/1/2000', periods=10, freq='D') + self.assert_(result.equals(expected)) + + self.assert_(result.is_normalized) + self.assert_(not rng.is_normalized) + + def test_to_period(self): + from pandas.tseries.period import period_range + + ts = _simple_ts('1/1/2000', '1/1/2001') + + pts = ts.to_period() + exp = ts.copy() + exp.index = period_range('1/1/2000', '1/1/2001') + assert_series_equal(pts, exp) + + pts = ts.to_period('M') + self.assert_(pts.index.equals(exp.index.asfreq('M'))) + + def test_frame_to_period(self): + K = 5 + from pandas.tseries.period import period_range + + dr = date_range('1/1/2000', '1/1/2001') + pr = period_range('1/1/2000', '1/1/2001') + df = DataFrame(randn(len(dr), K), index=dr) + df['mix'] = 'a' + + pts = df.to_period() + exp = df.copy() + exp.index = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M') + self.assert_(pts.index.equals(exp.index.asfreq('M'))) + + df = df.T + pts = df.to_period(axis=1) + exp = df.copy() + exp.columns = pr + assert_frame_equal(pts, exp) + + pts = df.to_period('M', axis=1) + self.assert_(pts.columns.equals(exp.columns.asfreq('M'))) + + self.assertRaises(ValueError, df.to_period, axis=2) + + def test_timestamp_fields(self): + # extra fields from DatetimeIndex like quarter and week + 
from pandas.lib import Timestamp + idx = tm.makeDateIndex(10) + + fields = ['dayofweek', 'dayofyear', 'week', 'weekofyear', 'quarter'] + for f in fields: + expected = getattr(idx, f)[0] + result = getattr(Timestamp(idx[0]), f) + self.assertEqual(result, expected) + + self.assertEqual(idx.freq, Timestamp(idx[0], idx.freq).freq) + self.assertEqual(idx.freqstr, Timestamp(idx[0], idx.freq).freqstr) + + def test_timestamp_date_out_of_range(self): + self.assertRaises(ValueError, Timestamp, '1676-01-01') + self.assertRaises(ValueError, Timestamp, '2263-01-01') + + # 1475 + self.assertRaises(ValueError, DatetimeIndex, ['1400-01-01']) + self.assertRaises(ValueError, DatetimeIndex, [datetime(1400, 1, 1)]) + + def test_timestamp_repr(self): + # pre-1900 + stamp = Timestamp('1850-01-01', tz='US/Eastern') + repr(stamp) + + iso8601 = '1850-01-01 01:23:45.012345' + stamp = Timestamp(iso8601, tz='US/Eastern') + result = repr(stamp) + self.assert_(iso8601 in result) + + def test_datetimeindex_integers_shift(self): + rng = date_range('1/1/2000', periods=20) + + result = rng + 5 + expected = rng.shift(5) + self.assert_(result.equals(expected)) + + result = rng - 5 + expected = rng.shift(-5) + self.assert_(result.equals(expected)) + + def test_astype_object(self): + # NumPy 1.6.1 weak ns support + rng = date_range('1/1/2000', periods=20) + + casted = rng.astype('O') + exp_values = list(rng) + + self.assert_(np.array_equal(casted, exp_values)) + + def test_catch_infinite_loop(self): + offset = datetools.DateOffset(minute=5) + # blow up, don't loop forever + self.assertRaises(Exception, date_range, datetime(2011,11,11), + datetime(2011,11,12), freq=offset) + + def test_append_concat(self): + rng = date_range('5/8/2012 1:45', periods=10, freq='5T') + ts = Series(np.random.randn(len(rng)), rng) + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + + result = ts.append(ts) + result_df = df.append(df) + ex_index = DatetimeIndex(np.tile(rng.values, 2)) + self.assert_(result.index.equals(ex_index)) + self.assert_(result_df.index.equals(ex_index)) + + appended = rng.append(rng) + self.assert_(appended.equals(ex_index)) + + appended = rng.append([rng, rng]) + ex_index = DatetimeIndex(np.tile(rng.values, 3)) + self.assert_(appended.equals(ex_index)) + + # different index names + rng1 = rng.copy() + rng2 = rng.copy() + rng1.name = 'foo' + rng2.name = 'bar' + self.assert_(rng1.append(rng1).name == 'foo') + self.assert_(rng1.append(rng2).name is None) + + def test_set_dataframe_column_ns_dtype(self): + x = DataFrame([datetime.now(), datetime.now()]) + self.assert_(x[0].dtype == object) + + x[0] = to_datetime(x[0]) + self.assert_(x[0].dtype == np.dtype('M8[ns]')) + + def test_groupby_count_dateparseerror(self): + dr = date_range(start='1/1/2012', freq='5min', periods=10) + + # BAD Example, datetimes first + s = Series(np.arange(10), index=[dr, range(10)]) + grouped = s.groupby(lambda x: x[1] % 2 == 0) + result = grouped.count() + + s = Series(np.arange(10), index=[range(10), dr]) + grouped = s.groupby(lambda x: x[0] % 2 == 0) + expected = grouped.count() + + assert_series_equal(result, expected) + + +def _simple_ts(start, end, freq='D'): + rng = date_range(start, end, freq=freq) + return Series(np.random.randn(len(rng)), index=rng) + + +class TestDatetimeIndex(unittest.TestCase): + + def test_append_join_nondatetimeindex(self): + rng = date_range('1/1/2000', periods=10) + idx = Index(['a', 'b', 'c', 'd']) + + result = rng.append(idx) + self.assert_(isinstance(result[0], Timestamp)) + + # it works + rng.join(idx, 
how='outer') + + def test_astype(self): + rng = date_range('1/1/2000', periods=10) + + result = rng.astype('i8') + self.assert_(np.array_equal(result, rng.asi8)) + + def test_to_period_nofreq(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.to_period) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03'], + freq='infer') + idx.to_period() + + def test_constructor_coverage(self): + rng = date_range('1/1/2000', periods=10.5) + exp = date_range('1/1/2000', periods=10) + self.assert_(rng.equals(exp)) + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + periods='foo', freq='D') + + self.assertRaises(ValueError, DatetimeIndex, start='1/1/2000', + end='1/10/2000') + + self.assertRaises(ValueError, DatetimeIndex, '1/1/2000') + + # generator expression + gen = (datetime(2000, 1, 1) + timedelta(i) for i in range(10)) + result = DatetimeIndex(gen) + expected = DatetimeIndex([datetime(2000, 1, 1) + timedelta(i) + for i in range(10)]) + self.assert_(result.equals(expected)) + + # NumPy string array + strings = np.array(['2000-01-01', '2000-01-02', '2000-01-03']) + result = DatetimeIndex(strings) + expected = DatetimeIndex(strings.astype('O')) + self.assert_(result.equals(expected)) + + from_ints = DatetimeIndex(expected.asi8) + self.assert_(from_ints.equals(expected)) + + # non-conforming + self.assertRaises(ValueError, DatetimeIndex, + ['2000-01-01', '2000-01-02', '2000-01-04'], + freq='D') + + self.assertRaises(ValueError, DatetimeIndex, + start='2011-01-01', freq='b') + self.assertRaises(ValueError, DatetimeIndex, + end='2011-01-01', freq='B') + self.assertRaises(ValueError, DatetimeIndex, periods=10, freq='D') + + def test_comparisons_coverage(self): + rng = date_range('1/1/2000', periods=10) + + # raise TypeError for now + self.assertRaises(TypeError, rng.__lt__, rng[3].value) + + result = rng == list(rng) + exp = rng == rng + self.assert_(np.array_equal(result, exp)) + + def test_map(self): + rng = date_range('1/1/2000', periods=10) + + f = lambda x: x.strftime('%Y%m%d') + result = rng.map(f) + exp = [f(x) for x in rng] + self.assert_(np.array_equal(result, exp)) + + def test_add_union(self): + rng = date_range('1/1/2000', periods=5) + rng2 = date_range('1/6/2000', periods=5) + + result = rng + rng2 + expected = rng.union(rng2) + self.assert_(result.equals(expected)) + + def test_misc_coverage(self): + rng = date_range('1/1/2000', periods=5) + result = rng.groupby(rng.day) + self.assert_(isinstance(result.values()[0][0], Timestamp)) + + idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) + self.assert_(idx.equals(list(idx))) + + non_datetime = Index(list('abc')) + self.assert_(not idx.equals(list(non_datetime))) + + def test_union_coverage(self): + idx = DatetimeIndex(['2000-01-03', '2000-01-01', '2000-01-02']) + ordered = DatetimeIndex(idx.order(), freq='infer') + result = ordered.union(idx) + self.assert_(result.equals(ordered)) + + result = ordered[:0].union(ordered) + self.assert_(result.equals(ordered)) + self.assert_(result.freq == ordered.freq) + + # def test_add_timedelta64(self): + # rng = date_range('1/1/2000', periods=5) + # delta = rng.values[3] - rng.values[1] + + # result = rng + delta + # expected = rng + timedelta(2) + # self.assert_(result.equals(expected)) + + def test_get_duplicates(self): + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-02', + '2000-01-03', '2000-01-03', '2000-01-04']) + + result = idx.get_duplicates() + ex = DatetimeIndex(['2000-01-02', 
'2000-01-03']) + self.assert_(result.equals(ex)) + + def test_argmin_argmax(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) + self.assertEqual(idx.argmin(), 1) + self.assertEqual(idx.argmax(), 0) + + def test_order(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) + + ordered = idx.order() + self.assert_(ordered.is_monotonic) + + ordered = idx.order(ascending=False) + self.assert_(ordered[::-1].is_monotonic) + + ordered, dexer = idx.order(return_indexer=True) + self.assert_(ordered.is_monotonic) + self.assert_(np.array_equal(dexer, [1, 2, 0])) + + ordered, dexer = idx.order(return_indexer=True, ascending=False) + self.assert_(ordered[::-1].is_monotonic) + self.assert_(np.array_equal(dexer, [0, 2, 1])) + + def test_insert(self): + idx = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-02']) + + result = idx.insert(2, datetime(2000, 1, 5)) + exp = DatetimeIndex(['2000-01-04', '2000-01-01', '2000-01-05', + '2000-01-02']) + self.assert_(result.equals(exp)) + + idx = date_range('1/1/2000', periods=3, freq='M') + result = idx.insert(3, datetime(2000, 4, 30)) + self.assert_(result.freqstr == 'M') + +class TestLegacySupport(unittest.TestCase): + + @classmethod + def setUpClass(cls): + if py3compat.PY3: + raise nose.SkipTest + + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'frame.pickle') + + with open(filepath, 'rb') as f: + cls.frame = pickle.load(f) + + filepath = os.path.join(pth, 'data', 'series.pickle') + with open(filepath, 'rb') as f: + cls.series = pickle.load(f) + + def test_pass_offset_warn(self): + from StringIO import StringIO + import sys + buf = StringIO() + + sys.stderr = buf + DatetimeIndex(start='1/1/2000', periods=10, offset='H') + sys.stderr = sys.__stderr__ + + def test_unpickle_legacy_frame(self): + dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', + freq=BDay(1)) + + unpickled = self.frame + + self.assertEquals(type(unpickled.index), DatetimeIndex) + self.assertEquals(len(unpickled), 10) + self.assert_((unpickled.columns == Int64Index(np.arange(5))).all()) + self.assert_((unpickled.index == dtindex).all()) + self.assertEquals(unpickled.index.offset, BDay(1, normalize=True)) + + def test_unpickle_legacy_series(self): + from pandas.core.datetools import BDay + + unpickled = self.series + + dtindex = DatetimeIndex(start='1/3/2005', end='1/14/2005', + freq=BDay(1)) + + self.assertEquals(type(unpickled.index), DatetimeIndex) + self.assertEquals(len(unpickled), 10) + self.assert_((unpickled.index == dtindex).all()) + self.assertEquals(unpickled.index.offset, BDay(1, normalize=True)) + + def test_unpickle_legacy_len0_daterange(self): + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'series_daterange0.pickle') + + result = com.load(filepath) + + ex_index = DatetimeIndex([], freq='B') + + self.assert_(result.index.equals(ex_index)) + self.assert_(isinstance(result.index.freq, offsets.BDay)) + self.assert_(len(result) == 0) + + def test_arithmetic_interaction(self): + index = self.frame.index + obj_index = index.asobject + + dseries = Series(rand(len(index)), index=index) + oseries = Series(dseries.values, index=obj_index) + + result = dseries + oseries + expected = dseries * 2 + self.assert_(isinstance(result.index, DatetimeIndex)) + assert_series_equal(result, expected) + + result = dseries + oseries[:5] + expected = dseries + dseries[:5] + self.assert_(isinstance(result.index, DatetimeIndex)) + assert_series_equal(result, expected) + + def 
test_join_interaction(self): + index = self.frame.index + obj_index = index.asobject + + def _check_join(left, right, how='inner'): + ra, rb, rc = left.join(right, how=how, return_indexers=True) + ea, eb, ec = left.join(DatetimeIndex(right), how=how, + return_indexers=True) + + self.assert_(isinstance(ra, DatetimeIndex)) + self.assert_(ra.equals(ea)) + + assert_almost_equal(rb, eb) + assert_almost_equal(rc, ec) + + _check_join(index[:15], obj_index[5:], how='inner') + _check_join(index[:15], obj_index[5:], how='outer') + _check_join(index[:15], obj_index[5:], how='right') + _check_join(index[:15], obj_index[5:], how='left') + + def test_unpickle_daterange(self): + pth, _ = os.path.split(os.path.abspath(__file__)) + filepath = os.path.join(pth, 'data', 'daterange_073.pickle') + + rng = com.load(filepath) + self.assert_(type(rng[0]) == datetime) + self.assert_(isinstance(rng.offset, offsets.BDay)) + self.assert_(rng.values.dtype == object) + + def test_setops(self): + index = self.frame.index + obj_index = index.asobject + + result = index[:5].union(obj_index[5:]) + expected = index + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result.equals(expected)) + + result = index[:10].intersection(obj_index[5:]) + expected = index[5:10] + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result.equals(expected)) + + result = index[:10] - obj_index[5:] + expected = index[:5] + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result.equals(expected)) + + def test_index_conversion(self): + index = self.frame.index + obj_index = index.asobject + + conv = DatetimeIndex(obj_index) + self.assert_(conv.equals(index)) + + self.assertRaises(ValueError, DatetimeIndex, ['a', 'b', 'c', 'd']) + + def test_tolist(self): + rng = date_range('1/1/2000', periods=10) + + result = rng.tolist() + self.assert_(isinstance(result[0], Timestamp)) + + def test_object_convert_fail(self): + idx = DatetimeIndex([NaT]) + self.assertRaises(ValueError, idx.astype, 'O') + + def test_setops_conversion_fail(self): + index = self.frame.index + + right = Index(['a', 'b', 'c', 'd']) + + result = index.union(right) + expected = Index(np.concatenate([index.asobject, right])) + self.assert_(result.equals(expected)) + + result = index.intersection(right) + expected = Index([]) + self.assert_(result.equals(expected)) + + def test_legacy_time_rules(self): + rules = [('WEEKDAY', 'B'), + ('EOM', 'BM'), + ('W@MON', 'W-MON'), ('W@TUE', 'W-TUE'), ('W@WED', 'W-WED'), + ('W@THU', 'W-THU'), ('W@FRI', 'W-FRI'), + ('Q@JAN', 'BQ-JAN'), ('Q@FEB', 'BQ-FEB'), ('Q@MAR', 'BQ-MAR'), + ('A@JAN', 'BA-JAN'), ('A@FEB', 'BA-FEB'), ('A@MAR', 'BA-MAR'), + ('A@APR', 'BA-APR'), ('A@MAY', 'BA-MAY'), ('A@JUN', 'BA-JUN'), + ('A@JUL', 'BA-JUL'), ('A@AUG', 'BA-AUG'), ('A@SEP', 'BA-SEP'), + ('A@OCT', 'BA-OCT'), ('A@NOV', 'BA-NOV'), ('A@DEC', 'BA-DEC'), + ('WOM@1FRI', 'WOM-1FRI'), ('WOM@2FRI', 'WOM-2FRI'), + ('WOM@3FRI', 'WOM-3FRI'), ('WOM@4FRI', 'WOM-4FRI')] + + start, end = '1/1/2000', '1/1/2010' + + for old_freq, new_freq in rules: + old_rng = date_range(start, end, freq=old_freq) + new_rng = date_range(start, end, freq=new_freq) + self.assert_(old_rng.equals(new_rng)) + + # test get_legacy_offset_name + offset = datetools.get_offset(new_freq) + old_name = datetools.get_legacy_offset_name(offset) + self.assertEquals(old_name, old_freq) + + def test_ms_vs_MS(self): + left = datetools.get_offset('ms') + right = datetools.get_offset('MS') + self.assert_(left == datetools.Milli()) + self.assert_(right == datetools.MonthBegin()) + + 
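# --- Editor's note: illustrative usage sketch, not part of the imported pandas 0.8.0 patch ---
# The tests just above and below (test_legacy_time_rules, test_ms_vs_MS, test_rule_aliases)
# exercise the frequency-alias machinery. A minimal sketch, using only the calls those tests
# already show and assuming the `import pandas.core.datetools as datetools` import used
# elsewhere in these test modules:
import pandas.core.datetools as datetools

assert datetools.get_offset('ms') == datetools.Milli()       # lower-case 'ms' is milliseconds
assert datetools.get_offset('MS') == datetools.MonthBegin()  # upper-case 'MS' is month start
assert datetools.to_offset('10us') == datetools.Micro(10)    # alias strings parse to offsets
# legacy rule names such as 'W@MON' map back from the new offsets:
assert datetools.get_legacy_offset_name(datetools.get_offset('W-MON')) == 'W@MON'
# --- end editor's note ---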
def test_rule_aliases(self): + rule = datetools.to_offset('10us') + self.assert_(rule == datetools.Micro(10)) + + def test_slice_year(self): + dti = DatetimeIndex(freq='B', start=datetime(2005,1,1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + result = s['2005'] + expected = s[s.index.year == 2005] + assert_series_equal(result, expected) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + result = df.ix['2005'] + expected = df[df.index.year == 2005] + assert_frame_equal(result, expected) + + rng = date_range('1/1/2000', '1/1/2010') + + result = rng.get_loc('2009') + expected = slice(3288, 3653) + self.assert_(result == expected) + + def test_slice_quarter(self): + dti = DatetimeIndex(freq='D', start=datetime(2000,6,1), periods=500) + + s = Series(np.arange(len(dti)), index=dti) + self.assertEquals(len(s['2001Q1']), 90) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEquals(len(df.ix['1Q01']), 90) + + def test_slice_month(self): + dti = DatetimeIndex(freq='D', start=datetime(2005,1,1), periods=500) + s = Series(np.arange(len(dti)), index=dti) + self.assertEquals(len(s['2005-11']), 30) + + df = DataFrame(np.random.rand(len(dti), 5), index=dti) + self.assertEquals(len(df.ix['2005-11']), 30) + + def test_partial_slice(self): + rng = DatetimeIndex(freq='D', start=datetime(2005,1,1), periods=500) + s = Series(np.arange(len(rng)), index=rng) + + result = s['2005-05':'2006-02'] + expected = s['20050501':'20060228'] + assert_series_equal(result, expected) + + result = s['2005-05':] + expected = s['20050501':] + assert_series_equal(result, expected) + + result = s[:'2006-02'] + expected = s[:'20060228'] + assert_series_equal(result, expected) + + def test_partial_not_monotonic(self): + rng = date_range(datetime(2005,1,1), periods=20, freq='M') + ts = Series(np.arange(len(rng)), index=rng) + ts = ts.take(np.random.permutation(20)) + + self.assertRaises(Exception, ts.__getitem__, '2005') + + def test_date_range_normalize(self): + snap = datetime.today() + n = 50 + + rng = date_range(snap, periods=n, normalize=False, freq='2D') + + offset = timedelta(2) + values = np.array([snap + i * offset for i in range(n)], + dtype='M8[ns]') + + self.assert_(np.array_equal(rng, values)) + + rng = date_range('1/1/2000 08:15', periods=n, normalize=False, freq='B') + the_time = time(8, 15) + for val in rng: + self.assert_(val.time() == the_time) + + def test_timedelta(self): + # this is valid too + index = date_range('1/1/2000', periods=50, freq='B') + shifted = index + timedelta(1) + back = shifted + timedelta(-1) + self.assert_(tm.equalContents(index, back)) + self.assertEqual(shifted.freq, index.freq) + self.assertEqual(shifted.freq, back.freq) + + result = index - timedelta(1) + expected = index + timedelta(-1) + self.assert_(result.equals(expected)) + + def test_shift(self): + ts = Series(np.random.randn(5), + index=date_range('1/1/2000', periods=5, freq='H')) + + result = ts.shift(1, freq='5T') + exp_index = ts.index.shift(1, freq='5T') + self.assert_(result.index.equals(exp_index)) + + # GH #1063, multiple of same base + result = ts.shift(1, freq='4H') + exp_index = ts.index + datetools.Hour(4) + self.assert_(result.index.equals(exp_index)) + + idx = DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-04']) + self.assertRaises(ValueError, idx.shift, 1) + + def test_setops_preserve_freq(self): + rng = date_range('1/1/2000', '1/1/2002') + + result = rng[:50].union(rng[50:100]) + self.assert_(result.freq == rng.freq) + + result = rng[:50].union(rng[30:100]) + 
self.assert_(result.freq == rng.freq) + + result = rng[:50].union(rng[60:100]) + self.assert_(result.freq is None) + + result = rng[:50].intersection(rng[25:75]) + self.assert_(result.freqstr == 'D') + + nofreq = DatetimeIndex(list(rng[25:75])) + result = rng[:50].union(nofreq) + self.assert_(result.freq == rng.freq) + + result = rng[:50].intersection(nofreq) + self.assert_(result.freq == rng.freq) + + +class TestLegacyCompat(unittest.TestCase): + + def setUp(self): + from StringIO import StringIO + # suppress deprecation warnings + sys.stderr = StringIO() + + def test_inferTimeRule(self): + from pandas.tseries.frequencies import inferTimeRule + + index1 = [datetime(2010, 1, 29, 0, 0), + datetime(2010, 2, 26, 0, 0), + datetime(2010, 3, 31, 0, 0)] + + index2 = [datetime(2010, 3, 26, 0, 0), + datetime(2010, 3, 29, 0, 0), + datetime(2010, 3, 30, 0, 0)] + + index3 = [datetime(2010, 3, 26, 0, 0), + datetime(2010, 3, 27, 0, 0), + datetime(2010, 3, 29, 0, 0)] + + # LEGACY + assert inferTimeRule(index1) == 'EOM' + assert inferTimeRule(index2) == 'WEEKDAY' + + self.assertRaises(Exception, inferTimeRule, index1[:2]) + self.assertRaises(Exception, inferTimeRule, index3) + + def test_time_rule(self): + result = DateRange('1/1/2000', '1/30/2000', time_rule='WEEKDAY') + result2 = DateRange('1/1/2000', '1/30/2000', timeRule='WEEKDAY') + expected = date_range('1/1/2000', '1/30/2000', freq='B') + + self.assert_(result.equals(expected)) + self.assert_(result2.equals(expected)) + + def tearDown(self): + sys.stderr = sys.__stderr__ + + +class TestDatetime64(unittest.TestCase): + """ + Also test support for datetime64[ns] in Series / DataFrame + """ + + + def setUp(self): + dti = DatetimeIndex(start=datetime(2005,1,1), + end=datetime(2005,1,10), freq='Min') + self.series = Series(rand(len(dti)), dti) + + def test_datetimeindex_accessors(self): + dti = DatetimeIndex(freq='Q-JAN', start=datetime(1997,12,31), periods=100) + + self.assertEquals(dti.year[0], 1998) + self.assertEquals(dti.month[0], 1) + self.assertEquals(dti.day[0], 31) + self.assertEquals(dti.hour[0], 0) + self.assertEquals(dti.minute[0], 0) + self.assertEquals(dti.second[0], 0) + self.assertEquals(dti.microsecond[0], 0) + self.assertEquals(dti.dayofweek[0], 5) + + self.assertEquals(dti.dayofyear[0], 31) + self.assertEquals(dti.dayofyear[1], 120) + + self.assertEquals(dti.weekofyear[0], 5) + self.assertEquals(dti.weekofyear[1], 18) + + self.assertEquals(dti.quarter[0], 1) + self.assertEquals(dti.quarter[1], 2) + + self.assertEquals(len(dti.year), 100) + self.assertEquals(len(dti.month), 100) + self.assertEquals(len(dti.day), 100) + self.assertEquals(len(dti.hour), 100) + self.assertEquals(len(dti.minute), 100) + self.assertEquals(len(dti.second), 100) + self.assertEquals(len(dti.microsecond), 100) + self.assertEquals(len(dti.dayofweek), 100) + self.assertEquals(len(dti.dayofyear), 100) + self.assertEquals(len(dti.weekofyear), 100) + self.assertEquals(len(dti.quarter), 100) + + def test_nanosecond_field(self): + dti = DatetimeIndex(np.arange(10)) + + self.assert_(np.array_equal(dti.nanosecond, np.arange(10))) + + def test_datetimeindex_diff(self): + dti1 = DatetimeIndex(freq='Q-JAN', start=datetime(1997,12,31), + periods=100) + dti2 = DatetimeIndex(freq='Q-JAN', start=datetime(1997,12,31), + periods=98) + self.assert_( len(dti1.diff(dti2)) == 2) + + def test_fancy_getitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005,1,1), + end=datetime(2010,1,1)) + + s = Series(np.arange(len(dti)), index=dti) + + self.assertEquals(s[48], 48) +
self.assertEquals(s['1/2/2009'], 48) + self.assertEquals(s['2009-1-2'], 48) + self.assertEquals(s[datetime(2009,1,2)], 48) + self.assertEquals(s[lib.Timestamp(datetime(2009,1,2))], 48) + self.assertRaises(KeyError, s.__getitem__, '2009-1-3') + + assert_series_equal(s['3/6/2009':'2009-06-05'], + s[datetime(2009,3,6):datetime(2009,6,5)]) + + def test_fancy_setitem(self): + dti = DatetimeIndex(freq='WOM-1FRI', start=datetime(2005,1,1), + end=datetime(2010,1,1)) + + s = Series(np.arange(len(dti)), index=dti) + s[48] = -1 + self.assertEquals(s[48], -1) + s['1/2/2009'] = -2 + self.assertEquals(s[48], -2) + s['1/2/2009':'2009-06-05'] = -3 + self.assert_((s[48:54] == -3).all()) + + def test_datetimeindex_constructor(self): + arr = ['1/1/2005', '1/2/2005', 'Jn 3, 2005', '2005-01-04'] + self.assertRaises(Exception, DatetimeIndex, arr) + + arr = ['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04'] + idx1 = DatetimeIndex(arr) + + arr = [datetime(2005,1,1), '1/2/2005', '1/3/2005', '2005-01-04'] + idx2 = DatetimeIndex(arr) + + arr = [lib.Timestamp(datetime(2005,1,1)), '1/2/2005', '1/3/2005', + '2005-01-04'] + idx3 = DatetimeIndex(arr) + + arr = np.array(['1/1/2005', '1/2/2005', '1/3/2005', + '2005-01-04'], dtype='O') + idx4 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', '1/3/2005', '2005-01-04']) + idx5 = DatetimeIndex(arr) + + arr = to_datetime(['1/1/2005', '1/2/2005', 'Jan 3, 2005', '2005-01-04']) + idx6 = DatetimeIndex(arr) + + for other in [idx2, idx3, idx4, idx5, idx6]: + self.assert_( (idx1.values == other.values).all() ) + + sdate = datetime(1999, 12, 25) + edate = datetime(2000, 1, 1) + idx = DatetimeIndex(start=sdate, freq='1B', periods=20) + self.assertEquals(len(idx), 20) + self.assertEquals(idx[0], sdate + 0 * dt.bday) + self.assertEquals(idx.freq, 'B') + + idx = DatetimeIndex(end=edate, freq=('D', 5), periods=20) + self.assertEquals(len(idx), 20) + self.assertEquals(idx[-1], edate) + self.assertEquals(idx.freq, '5D') + + idx1 = DatetimeIndex(start=sdate, end=edate, freq='W-SUN') + idx2 = DatetimeIndex(start=sdate, end=edate, + freq=dt.Week(weekday=6)) + self.assertEquals(len(idx1), len(idx2)) + self.assertEquals(idx1.offset, idx2.offset) + + def test_dti_snap(self): + dti = DatetimeIndex(['1/1/2002', '1/2/2002', '1/3/2002', '1/4/2002', + '1/5/2002', '1/6/2002', '1/7/2002'], freq='D') + + res = dti.snap(freq='W-MON') + exp = date_range('12/31/2001', '1/7/2002', freq='w-mon') + exp = exp.repeat([3, 4]) + self.assert_( (res == exp).all() ) + + res = dti.snap(freq='B') + + exp = date_range('1/1/2002', '1/7/2002', freq='b') + exp = exp.repeat([1, 1, 1, 2, 2]) + self.assert_( (res == exp).all() ) + + def test_dti_reset_index_round_trip(self): + dti = DatetimeIndex(start='1/1/2001', end='6/1/2001', freq='D') + d1 = DataFrame({'v' : np.random.rand(len(dti))}, index=dti) + d2 = d1.reset_index() + self.assert_(d2.dtypes[0] == np.dtype('M8[ns]')) + d3 = d2.set_index('index') + assert_frame_equal(d1, d3) + + def test_datetimeindex_union_join_empty(self): + dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') + empty = Index([]) + + result = dti.union(empty) + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result is result) + + result = dti.join(empty) + self.assert_(isinstance(result, DatetimeIndex)) + + # TODO: test merge & concat with datetime64 block + + +class TestSeriesDatetime64(unittest.TestCase): + + def setUp(self): + self.series = Series(date_range('1/1/2000', periods=10)) + + def test_auto_conversion(self): + series = 
Series(list(date_range('1/1/2000', periods=10))) + self.assert_(series.dtype == object) + + def test_constructor_cant_cast_datetime64(self): + self.assertRaises(TypeError, Series, + date_range('1/1/2000', periods=10), dtype=float) + + def test_series_comparison_scalars(self): + val = datetime(2000, 1, 4) + result = self.series > val + expected = np.array([x > val for x in self.series]) + self.assert_(np.array_equal(result, expected)) + + val = self.series[5] + result = self.series > val + expected = np.array([x > val for x in self.series]) + self.assert_(np.array_equal(result, expected)) + + def test_between(self): + left, right = self.series[[2, 7]] + + result = self.series.between(left, right) + expected = (self.series >= left) & (self.series <= right) + assert_series_equal(result, expected) + + #---------------------------------------------------------------------- + # NaT support + + def test_NaT_scalar(self): + series = Series([0, 1000, 2000, iNaT], dtype='M8[ns]') + + val = series[3] + self.assert_(com.isnull(val)) + + series[2] = val + self.assert_(com.isnull(series[2])) + + def test_set_none_nan(self): + self.series[3] = None + self.assert_(self.series[3] is lib.NaT) + + self.series[3:5] = None + self.assert_(self.series[4] is lib.NaT) + + self.series[5] = np.nan + self.assert_(self.series[5] is lib.NaT) + + self.series[5:7] = np.nan + self.assert_(self.series[6] is lib.NaT) + + def test_intercept_astype_object(self): + # Work around NumPy 1.6 bugs + result = self.series.astype(object) + result2 = self.series.astype('O') + expected = Series([x for x in self.series], dtype=object) + + assert_series_equal(result, expected) + assert_series_equal(result2, expected) + + df = DataFrame({'a': self.series, + 'b' : np.random.randn(len(self.series))}) + + result = df.values.squeeze() + self.assert_((result[:, 0] == expected.values).all()) + + df = DataFrame({'a': self.series, + 'b' : ['foo'] * len(self.series)}) + + result = df.values.squeeze() + self.assert_((result[:, 0] == expected.values).all()) + + def test_union(self): + rng1 = date_range('1/1/1999', '1/1/2012', freq='MS') + s1 = Series(np.random.randn(len(rng1)), rng1) + + rng2 = date_range('1/1/1980', '12/1/2001', freq='MS') + s2 = Series(np.random.randn(len(rng2)), rng2) + df = DataFrame({'s1' : s1, 's2' : s2}) + self.assert_(df.index.values.dtype == np.dtype('M8[ns]')) + + +class TestTimestamp(unittest.TestCase): + + def test_basics_nanos(self): + val = np.int64(946684800000000000).view('M8[ns]') + stamp = Timestamp(val.view('i8') + 500) + self.assert_(stamp.year == 2000) + self.assert_(stamp.month == 1) + self.assert_(stamp.microsecond == 0) + self.assert_(stamp.nanosecond == 500) + + def test_comparison(self): + # 5-18-2012 00:00:00.000 + stamp = 1337299200000000000L + + val = Timestamp(stamp) + + self.assert_(val == val) + self.assert_(not val != val) + self.assert_(not val < val) + self.assert_(val <= val) + self.assert_(not val > val) + self.assert_(val >= val) + + other = datetime(2012, 5, 18) + self.assert_(val == other) + self.assert_(not val != other) + self.assert_(not val < other) + self.assert_(val <= other) + self.assert_(not val > other) + self.assert_(val >= other) + + other = Timestamp(stamp + 100) + + self.assert_(not val == other) + self.assert_(val != other) + self.assert_(val < other) + self.assert_(val <= other) + self.assert_(other > val) + self.assert_(other >= val) + + def test_cant_compare_tz_naive_w_aware(self): + _skip_if_no_pytz() + # #1404 + a = Timestamp('3/12/2012') + b = Timestamp('3/12/2012', 
tz='utc') + + self.assertRaises(Exception, a.__eq__, b) + self.assertRaises(Exception, a.__ne__, b) + self.assertRaises(Exception, a.__lt__, b) + self.assertRaises(Exception, a.__gt__, b) + self.assertRaises(Exception, b.__eq__, a) + self.assertRaises(Exception, b.__ne__, a) + self.assertRaises(Exception, b.__lt__, a) + self.assertRaises(Exception, b.__gt__, a) + + self.assertRaises(Exception, a.__eq__, b.to_pydatetime()) + self.assertRaises(Exception, a.to_pydatetime().__eq__, b) + + def test_delta_preserve_nanos(self): + val = Timestamp(1337299200000000123L) + result = val + timedelta(1) + self.assert_(result.nanosecond == val.nanosecond) + + def test_frequency_misc(self): + self.assertEquals(fmod.get_freq_group('T'), + fmod.FreqGroup.FR_MIN) + + code, stride = fmod.get_freq_code(offsets.Hour()) + self.assertEquals(code, fmod.FreqGroup.FR_HR) + + code, stride = fmod.get_freq_code((5, 'T')) + self.assertEquals(code, fmod.FreqGroup.FR_MIN) + self.assertEquals(stride, 5) + + offset = offsets.Hour() + result = fmod.to_offset(offset) + self.assertEquals(result, offset) + + result = fmod.to_offset((5, 'T')) + expected = offsets.Minute(5) + self.assertEquals(result, expected) + + self.assertRaises(KeyError, fmod.get_freq_code, (5, 'baz')) + + self.assertRaises(ValueError, fmod.to_offset, '100foo') + + self.assertRaises(ValueError, fmod.to_offset, ('', '')) + + result = fmod.get_standard_freq(offsets.Hour()) + self.assertEquals(result, 'H') + + def test_hash_equivalent(self): + d = {datetime(2011, 1, 1) : 5} + stamp = Timestamp(datetime(2011, 1, 1)) + self.assertEquals(d[stamp], 5) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_timezones.py b/pandas/tseries/tests/test_timezones.py new file mode 100644 index 00000000..dad235ed --- /dev/null +++ b/pandas/tseries/tests/test_timezones.py @@ -0,0 +1,508 @@ +# pylint: disable-msg=E1101,W0612 +from __future__ import with_statement # for Python 2.5 +from datetime import datetime, time, timedelta +import sys +import os +import unittest + +import nose + +import numpy as np + +from pandas import (Index, Series, TimeSeries, DataFrame, isnull, + date_range, Timestamp) + +from pandas import DatetimeIndex, Int64Index, to_datetime + +from pandas.core.daterange import DateRange +import pandas.core.datetools as datetools +import pandas.tseries.offsets as offsets +from pandas.tseries.index import bdate_range, date_range +import pandas.tseries.tools as tools + +from pandas.util.testing import assert_series_equal, assert_almost_equal +import pandas.util.testing as tm + +import pandas.lib as lib +import cPickle as pickle +import pandas.core.datetools as dt +from numpy.random import rand +from pandas.util.testing import assert_frame_equal +import pandas.util.py3compat as py3compat +from pandas.core.datetools import BDay +import pandas.core.common as com + + +def _skip_if_no_pytz(): + try: + import pytz + except ImportError: + raise nose.SkipTest + +try: + import pytz +except ImportError: + pass + + +class TestTimeZoneSupport(unittest.TestCase): + + def setUp(self): + _skip_if_no_pytz() + + def test_utc_to_local_no_modify(self): + rng = date_range('3/11/2012', '3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert('US/Eastern') + + # Values are unmodified + self.assert_(np.array_equal(rng.asi8, rng_eastern.asi8)) + + self.assert_(rng_eastern.tz == pytz.timezone('US/Eastern')) + + def test_localize_utc_conversion(self): + # Localizing to time zone should: 
+ # 1) check for DST ambiguities + # 2) convert to UTC + + rng = date_range('3/10/2012', '3/11/2012', freq='30T') + + converted = rng.tz_localize('US/Eastern') + expected_naive = rng + offsets.Hour(5) + self.assert_(np.array_equal(converted.asi8, expected_naive.asi8)) + + # DST ambiguity, this should fail + rng = date_range('3/11/2012', '3/12/2012', freq='30T') + self.assertRaises(Exception, rng.tz_localize, 'US/Eastern') + + def test_timestamp_tz_localize(self): + stamp = Timestamp('3/11/2012 04:00') + + result = stamp.tz_localize('US/Eastern') + expected = Timestamp('3/11/2012 04:00', tz='US/Eastern') + self.assertEquals(result.hour, expected.hour) + self.assertEquals(result, expected) + + def test_timedelta_push_over_dst_boundary(self): + # #1389 + + # 4 hours before DST transition + stamp = Timestamp('3/10/2012 22:00', tz='US/Eastern') + + result = stamp + timedelta(hours=6) + + # spring forward, + "7" hours + expected = Timestamp('3/11/2012 05:00', tz='US/Eastern') + + self.assertEquals(result, expected) + + def test_tz_localize_dti(self): + from pandas.tseries.offsets import Hour + + dti = DatetimeIndex(start='1/1/2005', end='1/1/2005 0:00:30.256', + freq='L') + dti2 = dti.tz_localize('US/Eastern') + + dti_utc = DatetimeIndex(start='1/1/2005 05:00', + end='1/1/2005 5:00:30.256', freq='L', + tz='utc') + + self.assert_(np.array_equal(dti2.values, dti_utc.values)) + + dti3 = dti2.tz_convert('US/Pacific') + self.assert_(np.array_equal(dti3.values, dti_utc.values)) + + dti = DatetimeIndex(start='11/6/2011 1:59', + end='11/6/2011 2:00', freq='L') + self.assertRaises(pytz.AmbiguousTimeError, dti.tz_localize, + 'US/Eastern') + + dti = DatetimeIndex(start='3/13/2011 1:59', end='3/13/2011 2:00', + freq='L') + self.assertRaises(pytz.NonExistentTimeError, dti.tz_localize, 'US/Eastern') + + def test_create_with_tz(self): + stamp = Timestamp('3/11/2012 05:00', tz='US/Eastern') + self.assertEquals(stamp.hour, 5) + + rng = date_range('3/11/2012 04:00', periods=10, freq='H', tz='US/Eastern') + + self.assertEquals(stamp, rng[1]) + + utc_stamp = Timestamp('3/11/2012 05:00', tz='utc') + self.assert_(utc_stamp.tzinfo is pytz.utc) + self.assertEquals(utc_stamp.hour, 5) + + stamp = Timestamp('3/11/2012 05:00').tz_localize('utc') + self.assertEquals(utc_stamp.hour, 5) + + def test_date_range_localize(self): + rng = date_range('3/11/2012 03:00', periods=15, freq='H', tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 03:00', '3/11/2012 04:00'], + tz='US/Eastern') + rng3 = date_range('3/11/2012 03:00', periods=15, freq='H') + rng3 = rng3.tz_localize('US/Eastern') + + self.assert_(rng.equals(rng3)) + + # DST transition time + val = rng[0] + exp = Timestamp('3/11/2012 03:00', tz='US/Eastern') + + self.assertEquals(val.hour, 3) + self.assertEquals(exp.hour, 3) + self.assertEquals(val, exp) # same UTC value + self.assert_(rng[:2].equals(rng2)) + + # Right before the DST transition + rng = date_range('3/11/2012 00:00', periods=2, freq='H', tz='US/Eastern') + rng2 = DatetimeIndex(['3/11/2012 00:00', '3/11/2012 01:00'], + tz='US/Eastern') + self.assert_(rng.equals(rng2)) + exp = Timestamp('3/11/2012 00:00', tz='US/Eastern') + self.assertEquals(exp.hour, 0) + self.assertEquals(rng[0], exp) + exp = Timestamp('3/11/2012 01:00', tz='US/Eastern') + self.assertEquals(exp.hour, 1) + self.assertEquals(rng[1], exp) + + self.assertRaises(pytz.NonExistentTimeError, date_range, + '3/11/2012 00:00', periods=10, freq='H', tz='US/Eastern') + + def test_utc_box_timestamp_and_localize(self): + rng = date_range('3/11/2012', 
'3/12/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert('US/Eastern') + + tz = pytz.timezone('US/Eastern') + expected = tz.normalize(rng[-1]) + + stamp = rng_eastern[-1] + self.assertEquals(stamp, expected) + self.assertEquals(stamp.tzinfo, expected.tzinfo) + + # right tzinfo + rng = date_range('3/13/2012', '3/14/2012', freq='H', tz='utc') + rng_eastern = rng.tz_convert('US/Eastern') + self.assert_('EDT' in repr(rng_eastern[0].tzinfo)) + + def test_timestamp_tz_convert(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + idx = DatetimeIndex(strdates, tz='US/Eastern') + + conv = idx[0].tz_convert('US/Pacific') + expected = idx.tz_convert('US/Pacific')[0] + + self.assertEquals(conv, expected) + + def test_pass_dates_localize_to_utc(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + + idx = DatetimeIndex(strdates) + conv = idx.tz_localize('US/Eastern') + + fromdates = DatetimeIndex(strdates, tz='US/Eastern') + + self.assert_(conv.tz == fromdates.tz) + self.assert_(np.array_equal(conv.values, fromdates.values)) + + def test_field_access_localize(self): + strdates = ['1/1/2012', '3/1/2012', '4/1/2012'] + rng = DatetimeIndex(strdates, tz='US/Eastern') + self.assert_((rng.hour == 0).all()) + + def test_with_tz(self): + tz = pytz.timezone('US/Central') + + # just want it to work + start = datetime(2011, 3, 12, tzinfo=pytz.utc) + dr = bdate_range(start, periods=50, freq=datetools.Hour()) + self.assert_(dr.tz is pytz.utc) + + # DateRange with naive datetimes + dr = bdate_range('1/1/2005', '1/1/2009', tz=pytz.utc) + dr = bdate_range('1/1/2005', '1/1/2009', tz=tz) + + # normalized + central = dr.tz_convert(tz) + self.assert_(central.tz is tz) + self.assert_(central[0].tz is tz) + + # datetimes with tzinfo set + dr = bdate_range(datetime(2005, 1, 1, tzinfo=pytz.utc), + '1/1/2009', tz=pytz.utc) + + self.assertRaises(Exception, bdate_range, + datetime(2005, 1, 1, tzinfo=pytz.utc), + '1/1/2009', tz=tz) + + def test_tz_localize(self): + dr = bdate_range('1/1/2009', '1/1/2010') + dr_utc = bdate_range('1/1/2009', '1/1/2010', tz=pytz.utc) + localized = dr.tz_localize(pytz.utc) + self.assert_(np.array_equal(dr_utc, localized)) + + def test_with_tz_ambiguous_times(self): + tz = pytz.timezone('US/Eastern') + + rng = bdate_range(datetime(2009, 1, 1), datetime(2010, 1, 1)) + + # March 13, 2011, spring forward, skip from 2 AM to 3 AM + dr = date_range(datetime(2011, 3, 13, 1, 30), periods=3, + freq=datetools.Hour()) + self.assertRaises(pytz.NonExistentTimeError, dr.tz_localize, tz) + + # after dst transition, it works + dr = date_range(datetime(2011, 3, 13, 3, 30), periods=3, + freq=datetools.Hour(), tz=tz) + + # November 6, 2011, fall back, repeat 2 AM hour + dr = date_range(datetime(2011, 11, 6, 1, 30), periods=3, + freq=datetools.Hour()) + self.assertRaises(pytz.AmbiguousTimeError, dr.tz_localize, tz) + + # UTC is OK + dr = date_range(datetime(2011, 3, 13), periods=48, + freq=datetools.Minute(30), tz=pytz.utc) + + # test utility methods + def test_infer_tz(self): + eastern = pytz.timezone('US/Eastern') + utc = pytz.utc + + _start = datetime(2001, 1, 1) + _end = datetime(2009, 1, 1) + + start = eastern.localize(_start) + end = eastern.localize(_end) + assert(tools._infer_tzinfo(start, end) is eastern) + assert(tools._infer_tzinfo(start, None) is eastern) + assert(tools._infer_tzinfo(None, end) is eastern) + + start = utc.localize(_start) + end = utc.localize(_end) + assert(tools._infer_tzinfo(start, end) is utc) + + end = eastern.localize(_end) + self.assertRaises(Exception, 
tools._infer_tzinfo, start, end) + self.assertRaises(Exception, tools._infer_tzinfo, end, start) + + def test_tz_string(self): + result = date_range('1/1/2000', periods=10, tz='US/Eastern') + expected = date_range('1/1/2000', periods=10, + tz=pytz.timezone('US/Eastern')) + + self.assert_(result.equals(expected)) + + def test_take_dont_lose_meta(self): + _skip_if_no_pytz() + rng = date_range('1/1/2000', periods=20, tz='US/Eastern') + + result = rng.take(range(5)) + self.assert_(result.tz == rng.tz) + self.assert_(result.freq == rng.freq) + + def test_index_with_timezone_repr(self): + rng = date_range('4/13/2010', '5/6/2010') + + rng_eastern = rng.tz_localize('US/Eastern') + + rng_repr = repr(rng) + self.assert_('2010-04-13 00:00:00' in rng_repr) + + def test_index_astype_asobject_tzinfos(self): + # #1345 + + # dates around a dst transition + rng = date_range('2/13/2010', '5/6/2010', tz='US/Eastern') + + objs = rng.asobject + for i, x in enumerate(objs): + exval = rng[i] + self.assertEquals(x, exval) + self.assertEquals(x.tzinfo, exval.tzinfo) + + objs = rng.astype(object) + for i, x in enumerate(objs): + exval = rng[i] + self.assertEquals(x, exval) + self.assertEquals(x.tzinfo, exval.tzinfo) + + def test_localized_at_time_between_time(self): + from datetime import time + + rng = date_range('4/16/2012', '5/1/2012', freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_local = ts.tz_localize('US/Eastern') + + result = ts_local.at_time(time(10, 0)) + expected = ts.at_time(time(10, 0)).tz_localize('US/Eastern') + assert_series_equal(result, expected) + self.assert_(result.index.tz.zone == 'US/Eastern') + + t1, t2 = time(10, 0), time(11, 0) + result = ts_local.between_time(t1, t2) + expected = ts.between_time(t1, t2).tz_localize('US/Eastern') + assert_series_equal(result, expected) + self.assert_(result.index.tz.zone == 'US/Eastern') + + +class TestTimeZones(unittest.TestCase): + + def setUp(self): + _skip_if_no_pytz() + + def test_index_equals_with_tz(self): + left = date_range('1/1/2011', periods=100, freq='H', tz='utc') + right = date_range('1/1/2011', periods=100, freq='H', + tz='US/Eastern') + + self.assert_(not left.equals(right)) + + def test_tz_localize_naive(self): + rng = date_range('1/1/2011', periods=100, freq='H') + + conv = rng.tz_localize('US/Pacific') + exp = date_range('1/1/2011', periods=100, freq='H', tz='US/Pacific') + + self.assert_(conv.equals(exp)) + + def test_series_frame_tz_localize(self): + + rng = date_range('1/1/2011', periods=100, freq='H') + ts = Series(1, index=rng) + + result = ts.tz_localize('utc') + self.assert_(result.index.tz.zone == 'UTC') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_localize('utc') + expected = DataFrame({'a': 1}, rng.tz_localize('UTC')) + self.assert_(result.index.tz.zone == 'UTC') + assert_frame_equal(result, expected) + + df = df.T + result = df.tz_localize('utc', axis=1) + self.assert_(result.columns.tz.zone == 'UTC') + assert_frame_equal(result, expected.T) + + # Can't localize if already tz-aware + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + ts = Series(1, index=rng) + self.assertRaises(Exception, ts.tz_localize, 'US/Eastern') + + def test_series_frame_tz_convert(self): + rng = date_range('1/1/2011', periods=200, freq='D', + tz='US/Eastern') + ts = Series(1, index=rng) + + result = ts.tz_convert('Europe/Berlin') + self.assert_(result.index.tz.zone == 'Europe/Berlin') + + df = DataFrame({'a': 1}, index=rng) + result = df.tz_convert('Europe/Berlin') + expected = DataFrame({'a': 1}, 
rng.tz_convert('Europe/Berlin')) + self.assert_(result.index.tz.zone == 'Europe/Berlin') + assert_frame_equal(result, expected) + + df = df.T + result = df.tz_convert('Europe/Berlin', axis=1) + self.assert_(result.columns.tz.zone == 'Europe/Berlin') + assert_frame_equal(result, expected.T) + + # can't convert tz-naive + rng = date_range('1/1/2011', periods=200, freq='D') + ts = Series(1, index=rng) + self.assertRaises(Exception, ts.tz_convert, 'US/Eastern') + + def test_join_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng.tz_convert('US/Eastern') + right = rng.tz_convert('Europe/Berlin') + + for how in ['inner', 'outer', 'left', 'right']: + result = left.join(left[:-5], how=how) + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result.tz == left.tz) + + result = left.join(right[:-5], how=how) + self.assert_(isinstance(result, DatetimeIndex)) + self.assert_(result.tz.zone == 'UTC') + + def test_join_naive_with_aware(self): + rng = date_range('1/1/2011', periods=10, freq='H') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_utc = ts.tz_localize('utc') + + self.assertRaises(Exception, ts.__add__, ts_utc) + self.assertRaises(Exception, ts_utc.__add__, ts) + + def test_equal_join_ensure_utc(self): + rng = date_range('1/1/2011', periods=10, freq='H', tz='US/Eastern') + ts = Series(np.random.randn(len(rng)), index=rng) + + ts_moscow = ts.tz_convert('Europe/Moscow') + + result = ts + ts_moscow + self.assert_(result.index.tz is pytz.utc) + + result = ts_moscow + ts + self.assert_(result.index.tz is pytz.utc) + + df = DataFrame({'a': ts}) + df_moscow = df.tz_convert('Europe/Moscow') + result = df + df_moscow + self.assert_(result.index.tz is pytz.utc) + + result = df_moscow + df + self.assert_(result.index.tz is pytz.utc) + + def test_arith_utc_convert(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + perm = np.random.permutation(100)[:90] + ts1 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('US/Eastern')) + + perm = np.random.permutation(100)[:90] + ts2 = Series(np.random.randn(90), + index=rng.take(perm).tz_convert('Europe/Berlin')) + + result = ts1 + ts2 + + uts1 = ts1.tz_convert('utc') + uts2 = ts2.tz_convert('utc') + expected = uts1 + uts2 + + self.assert_(result.index.tz == pytz.UTC) + assert_series_equal(result, expected) + + def test_intersection(self): + rng = date_range('1/1/2011', periods=100, freq='H', tz='utc') + + left = rng[10:90][::-1] + right = rng[20:80][::-1] + + self.assert_(left.tz == rng.tz) + result = left.intersection(right) + self.assert_(result.tz == left.tz) + + def test_timestamp_equality_different_timezones(self): + utc_range = date_range('1/1/2000', periods=20, tz='UTC') + + eastern_range = utc_range.tz_convert('US/Eastern') + berlin_range = utc_range.tz_convert('Europe/Berlin') + + for a, b, c in zip(utc_range, eastern_range, berlin_range): + self.assertEquals(a, b) + self.assertEquals(b, c) + self.assertEquals(a, c) + + self.assert_((utc_range == eastern_range).all()) + self.assert_((utc_range == berlin_range).all()) + self.assert_((berlin_range == eastern_range).all()) + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/tests/test_util.py b/pandas/tseries/tests/test_util.py new file mode 100644 index 00000000..02a98858 --- /dev/null +++ b/pandas/tseries/tests/test_util.py @@ -0,0 +1,64 @@ +import nose +import unittest + +import numpy as np + +from pandas import 
Series, date_range +import pandas.util.testing as tm + +from pandas.tseries.util import pivot_annual, isleapyear + +class TestPivotAnnual(unittest.TestCase): + """ + New pandas of scikits.timeseries pivot_annual + """ + def test_daily(self): + rng = date_range('1/1/2000', '12/31/2004', freq='D') + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_annual(ts, 'D') + + doy = ts.index.dayofyear + doy[(-isleapyear(ts.index.year)) & (doy >= 60)] += 1 + + for i in range(1, 367): + subset = ts[doy == i] + subset.index = [x.year for x in subset.index] + + tm.assert_series_equal(annual[i].dropna(), subset) + + # check leap days + leaps = ts[(ts.index.month == 2) & (ts.index.day == 29)] + day = leaps.index.dayofyear[0] + leaps.index = leaps.index.year + tm.assert_series_equal(annual[day].dropna(), leaps) + + def test_weekly(self): + pass + + def test_monthly(self): + rng = date_range('1/1/2000', '12/31/2004', freq='M') + ts = Series(np.random.randn(len(rng)), index=rng) + + annual = pivot_annual(ts, 'M') + + month = ts.index.month + + for i in range(1, 13): + subset = ts[month == i] + subset.index = [x.year for x in subset.index] + tm.assert_series_equal(annual[i].dropna(), subset) + + def test_period_monthly(self): + pass + + def test_period_daily(self): + pass + + def test_period_weekly(self): + pass + +if __name__ == '__main__': + nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + exit=False) + diff --git a/pandas/tseries/tools.py b/pandas/tseries/tools.py new file mode 100644 index 00000000..9212bb44 --- /dev/null +++ b/pandas/tseries/tools.py @@ -0,0 +1,259 @@ +from datetime import datetime, timedelta +import re +import sys + +import numpy as np + +import pandas.lib as lib +import pandas.core.common as com + +try: + import dateutil + from dateutil.parser import parser + from dateutil.relativedelta import relativedelta + + # raise exception if dateutil 2.0 install on 2.x platform + if (sys.version_info[0] == 2 and + dateutil.__version__ == '2.0'): # pragma: no cover + raise Exception('dateutil 2.0 incompatible with Python 2.x, you must ' + 'install version 1.5!') +except ImportError: # pragma: no cover + print 'Please install python-dateutil via easy_install or some method!' 
+ raise # otherwise a 2nd import won't show the message + + +def _infer_tzinfo(start, end): + def _infer(a, b): + tz = a.tzinfo + if b and b.tzinfo: + assert(tz == b.tzinfo) + return tz + tz = None + if start is not None: + tz = _infer(start, end) + elif end is not None: + tz = _infer(end, start) + return tz + + +def _maybe_get_tz(tz): + if isinstance(tz, (str, unicode)): + import pytz + tz = pytz.timezone(tz) + return tz + +def _figure_out_timezone(start, end, tzinfo): + inferred_tz = _infer_tzinfo(start, end) + tzinfo = _maybe_get_tz(tzinfo) + + tz = inferred_tz + if inferred_tz is None and tzinfo is not None: + tz = tzinfo + elif tzinfo is not None: + assert(inferred_tz == tzinfo) + # make tz naive for now + + # tz = _maybe_get_tz(tz) + + start = start if start is None else start.replace(tzinfo=None) + end = end if end is None else end.replace(tzinfo=None) + + return start, end, tz + + +def to_datetime(arg, errors='ignore', dayfirst=False, box=True): + """ + Convert argument to datetime + + Parameters + ---------- + arg : string, datetime, array of strings (with possible NAs) + errors : {'ignore', 'raise'}, default 'ignore' + Errors are ignored by default (values left untouched) + + Returns + ------- + ret : datetime if parsing succeeded + """ + from pandas.core.series import Series + from pandas.tseries.index import DatetimeIndex + if arg is None: + return arg + elif isinstance(arg, datetime): + return arg + elif isinstance(arg, Series): + values = lib.array_to_datetime(com._ensure_object(arg.values), + raise_=errors == 'raise', + dayfirst=dayfirst) + return Series(values, index=arg.index, name=arg.name) + elif isinstance(arg, (np.ndarray, list)): + if isinstance(arg, list): + arg = np.array(arg, dtype='O') + result = lib.array_to_datetime(com._ensure_object(arg), + raise_=errors == 'raise', + dayfirst=dayfirst) + if com.is_datetime64_dtype(result) and box: + result = DatetimeIndex(result) + return result + try: + if not arg: + return arg + return _dtparser.parse(arg, dayfirst=dayfirst) + except Exception: + if errors == 'raise': + raise + return arg + + +class DateParseError(ValueError): + pass + + +_dtparser = parser() + + +# patterns for quarters like '4Q2005', '05Q1' +qpat1full = re.compile(r'(\d)Q(\d\d\d\d)') +qpat2full = re.compile(r'(\d\d\d\d)Q(\d)') +qpat1 = re.compile(r'(\d)Q(\d\d)') +qpat2 = re.compile(r'(\d\d)Q(\d)') + + +def parse_time_string(arg, freq=None): + """ + Try hard to parse datetime string, leveraging dateutil plus some extra + goodies like quarter recognition. 
+ + Parameters + ---------- + arg : basestring + freq : str or DateOffset, default None + Helps with interpreting time string if supplied + + Returns + ------- + datetime, datetime/dateutil.parser._result, str + """ + from pandas.core.format import print_config + from pandas.tseries.offsets import DateOffset + from pandas.tseries.frequencies import (_get_rule_month, _month_numbers, + _get_freq_str) + + if not isinstance(arg, basestring): + return arg + + arg = arg.upper() + try: + default = datetime(1,1,1).replace(hour=0, minute=0, + second=0, microsecond=0) + + # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 + if len(arg) in [4, 6]: + add_century = False + if len(arg) == 4: + add_century = True + qpats = [(qpat1, 1), (qpat2, 0)] + else: + qpats = [(qpat1full, 1), (qpat2full, 0)] + + for pat, yfirst in qpats: + qparse = pat.match(arg) + if qparse is not None: + if yfirst: + yi, qi = 1, 2 + else: + yi, qi = 2, 1 + q = int(qparse.group(yi)) + y_str = qparse.group(qi) + y = int(y_str) + if add_century: + y += 2000 + + if freq is not None: + # hack attack, #1228 + mnum = _month_numbers[_get_rule_month(freq)] + 1 + month = (mnum + (q - 1) * 3) % 12 + 1 + if month > mnum: + y -= 1 + else: + month = (q - 1) * 3 + 1 + + ret = default.replace(year=y, month=month) + return ret, ret, 'quarter' + + is_mo_str = freq is not None and freq == 'M' + is_mo_off = getattr(freq, 'rule_code', None) == 'M' + is_monthly = is_mo_str or is_mo_off + if len(arg) == 6 and is_monthly: + try: + ret = _try_parse_monthly(arg) + if ret is not None: + return ret, ret, 'month' + except Exception: + pass + + dayfirst = print_config.date_dayfirst + yearfirst = print_config.date_yearfirst + + parsed = _dtparser._parse(arg, dayfirst=dayfirst, yearfirst=yearfirst) + if parsed is None: + raise DateParseError("Could not parse %s" % arg) + + repl = {} + reso = 'year' + stopped = False + for attr in ["year", "month", "day", "hour", + "minute", "second", "microsecond"]: + can_be_zero = ['hour', 'minute', 'second', 'microsecond'] + value = getattr(parsed, attr) + if value is not None and value != 0: # or attr in can_be_zero): + repl[attr] = value + if not stopped: + reso = attr + else: + stopped = True + break + ret = default.replace(**repl) + return ret, parsed, reso # datetime, resolution + except Exception, e: + raise DateParseError(e) + +def _try_parse_monthly(arg): + base = 2000 + add_base = False + default = datetime(1, 1, 1).replace(hour=0, minute=0, second=0, + microsecond=0) + + if len(arg) == 4: + add_base = True + y = int(arg[:2]) + m = int(arg[2:4]) + elif len(arg) >= 6: # 201201 + y = int(arg[:4]) + m = int(arg[4:6]) + if add_base: + y += base + ret = default.replace(year=y, month=m) + return ret + +def normalize_date(dt): + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + + +def format(dt): + """Returns date in YYYYMMDD format.""" + return dt.strftime('%Y%m%d') + +OLE_TIME_ZERO = datetime(1899, 12, 30, 0, 0, 0) + +def ole2datetime(oledt): + """function for converting excel date to normal date format""" + val = float(oledt) + + # Excel has a bug where it thinks the date 2/29/1900 exists + # we just reject any date before 3/1/1900. 
+ if val < 61: + raise Exception("Value is outside of acceptable range: %s " % val) + + return OLE_TIME_ZERO + timedelta(days=val) + diff --git a/pandas/tseries/util.py b/pandas/tseries/util.py new file mode 100644 index 00000000..4b297712 --- /dev/null +++ b/pandas/tseries/util.py @@ -0,0 +1,85 @@ +import numpy as np + +from pandas.core.frame import DataFrame +import pandas.core.nanops as nanops + +def pivot_annual(series, freq=None): + """ + Group a series by years, taking leap years into account. + + The output has as many rows as distinct years in the original series, + and as many columns as the length of a leap year in the units corresponding + to the original frequency (366 for daily frequency, 366*24 for hourly...). + The first column of the output corresponds to Jan. 1st, 00:00:00, + while the last column corresponds to Dec. 31st, 23:59:59. + Entries corresponding to Feb. 29th are masked for non-leap years. + + For example, if the initial series has a daily frequency, the 59th column + of the output always corresponds to Feb. 28th, the 61st column to Mar. 1st, + and the 60th column is masked for non-leap years. + With an hourly initial frequency, the (59*24)th column of the output always + corresponds to Feb. 28th 23:00, the (61*24)th column to Mar. 1st, 00:00, and + the 24 columns between (59*24) and (61*24) are masked. + + If the original frequency is less than daily, the output is equivalent to + ``series.convert('A', func=None)``. + + Parameters + ---------- + series : TimeSeries + freq : string or None, default None + + Returns + ------- + annual : DataFrame + """ + index = series.index + year = index.year + years = nanops.unique1d(year) + + if freq is not None: + freq = freq.upper() + else: + freq = series.index.freq + + if freq == 'D': + width = 366 + offset = index.dayofyear - 1 + + # adjust for leap year + offset[(-isleapyear(year)) & (offset >= 59)] += 1 + + columns = range(1, 367) + # todo: strings like 1/1, 1/25, etc.? + elif freq in ('M', 'BM'): + width = 12 + offset = index.month - 1 + columns = range(1, 13) + else: + raise NotImplementedError(freq) + + flat_index = (year - years.min()) * width + offset + + values = np.empty((len(years), width), dtype=series.dtype) + + if not np.issubdtype(series.dtype, np.integer): + values.fill(np.nan) + else: + raise Exception('need to upcast') + + values.put(flat_index, series.values) + + return DataFrame(values, index=years, columns=columns) + +def isleapyear(year): + """ + Returns true if year is a leap year. + + Parameters + ---------- + year : integer / sequence + A given (list of) year(s). + """ + year = np.asarray(year) + return np.logical_or(year % 400 == 0, + np.logical_and(year % 4 == 0, year % 100 > 0)) diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/pandas/util/clipboard.py b/pandas/util/clipboard.py new file mode 100644 index 00000000..b2180001 --- /dev/null +++ b/pandas/util/clipboard.py @@ -0,0 +1,110 @@ +""" +Taken from the IPython project http://ipython.org + +Used under the terms of the BSD license +""" + +import subprocess +import sys + +def clipboard_get(): + """ Get text from the clipboard. + """ + if sys.platform == 'win32': + try: + return win32_clipboard_get() + except Exception: + pass + elif sys.platform == 'darwin': + try: + return osx_clipboard_get() + except Exception: + pass + return tkinter_clipboard_get() + +def clipboard_set(text): + """ Set text on the clipboard.
+ """ + if sys.platform == 'win32': + try: + return win32_clipboard_set(text) + except Exception: + raise + elif sys.platform == 'darwin': + try: + return osx_clipboard_set(text) + except Exception: + pass + xsel_clipboard_set(text) + +def win32_clipboard_get(): + """ Get the current clipboard's text on Windows. + + Requires Mark Hammond's pywin32 extensions. + """ + try: + import win32clipboard + except ImportError: + message = ("Getting text from the clipboard requires the pywin32 " + "extensions: http://sourceforge.net/projects/pywin32/") + raise Exception(message) + win32clipboard.OpenClipboard() + text = win32clipboard.GetClipboardData(win32clipboard.CF_TEXT) + # FIXME: convert \r\n to \n? + win32clipboard.CloseClipboard() + return text + +def osx_clipboard_get(): + """ Get the clipboard's text on OS X. + """ + p = subprocess.Popen(['pbpaste', '-Prefer', 'ascii'], + stdout=subprocess.PIPE) + text, stderr = p.communicate() + # Text comes in with old Mac \r line endings. Change them to \n. + text = text.replace('\r', '\n') + return text + +def tkinter_clipboard_get(): + """ Get the clipboard's text using Tkinter. + + This is the default on systems that are not Windows or OS X. It may + interfere with other UI toolkits and should be replaced with an + implementation that uses that toolkit. + """ + try: + import Tkinter + except ImportError: + message = ("Getting text from the clipboard on this platform " + "requires Tkinter.") + raise Exception(message) + root = Tkinter.Tk() + root.withdraw() + text = root.clipboard_get() + root.destroy() + return text + +def win32_clipboard_set(text): + # idiosyncratic win32 import issues + import pywintypes as _ + import win32clipboard + win32clipboard.OpenClipboard() + try: + win32clipboard.EmptyClipboard() + win32clipboard.SetClipboardText(_fix_line_endings(text)) + finally: + win32clipboard.CloseClipboard() + +def _fix_line_endings(text): + return '\r\n'.join(text.splitlines()) + +def osx_clipboard_set(text): + """ Set the clipboard's text on OS X. + """ + p = subprocess.Popen(['pbcopy', '-Prefer', 'ascii'], + stdin=subprocess.PIPE) + p.communicate(input=text) + +def xsel_clipboard_set(text): + from subprocess import Popen, PIPE + p = Popen(['xsel', '-bi'], stdin=PIPE) + p.communicate(input=text) diff --git a/pandas/util/compat.py b/pandas/util/compat.py new file mode 100644 index 00000000..213f0655 --- /dev/null +++ b/pandas/util/compat.py @@ -0,0 +1,14 @@ +# itertools.product not in Python 2.5 + +try: + from itertools import product +except ImportError: # python 2.5 + def product(*args, **kwds): + # product('ABCD', 'xy') --> Ax Ay Bx By Cx Cy Dx Dy + # product(range(2), repeat=3) --> 000 001 010 011 100 101 110 111 + pools = map(tuple, args) * kwds.get('repeat', 1) + result = [[]] + for pool in pools: + result = [x+[y] for x in result for y in pool] + for prod in result: + yield tuple(prod) diff --git a/pandas/util/counter.py b/pandas/util/counter.py new file mode 100644 index 00000000..f23f6e6f --- /dev/null +++ b/pandas/util/counter.py @@ -0,0 +1,290 @@ +# This is copied from collections in Python 2.7, for compatibility with older +# versions of Python.
It can be dropped when we depend on Python 2.7/3.1 + +import heapq as _heapq +from itertools import repeat as _repeat, chain as _chain, starmap as _starmap +from operator import itemgetter as _itemgetter + +try: + from collections import Mapping +except: + # ABCs were only introduced in Python 2.6, so this is a hack for Python 2.5: + Mapping = dict + +class Counter(dict): + '''Dict subclass for counting hashable items. Sometimes called a bag + or multiset. Elements are stored as dictionary keys and their counts + are stored as dictionary values. + + >>> c = Counter('abcdeabcdabcaba') # count elements from a string + + >>> c.most_common(3) # three most common elements + [('a', 5), ('b', 4), ('c', 3)] + >>> sorted(c) # list all unique elements + ['a', 'b', 'c', 'd', 'e'] + >>> ''.join(sorted(c.elements())) # list elements with repetitions + 'aaaaabbbbcccdde' + >>> sum(c.values()) # total of all counts + 15 + + >>> c['a'] # count of letter 'a' + 5 + >>> for elem in 'shazam': # update counts from an iterable + ... c[elem] += 1 # by adding 1 to each element's count + >>> c['a'] # now there are seven 'a' + 7 + >>> del c['b'] # remove all 'b' + >>> c['b'] # now there are zero 'b' + 0 + + >>> d = Counter('simsalabim') # make another counter + >>> c.update(d) # add in the second counter + >>> c['a'] # now there are nine 'a' + 9 + + >>> c.clear() # empty the counter + >>> c + Counter() + + Note: If a count is set to zero or reduced to zero, it will remain + in the counter until the entry is deleted or the counter is cleared: + + >>> c = Counter('aaabbc') + >>> c['b'] -= 2 # reduce the count of 'b' by two + >>> c.most_common() # 'b' is still in, but its count is zero + [('a', 3), ('c', 1), ('b', 0)] + + ''' + # References: + # http://en.wikipedia.org/wiki/Multiset + # http://www.gnu.org/software/smalltalk/manual-base/html_node/Bag.html + # http://www.demo2s.com/Tutorial/Cpp/0380__set-multiset/Catalog0380__set-multiset.htm + # http://code.activestate.com/recipes/259174/ + # Knuth, TAOCP Vol. II section 4.6.3 + + def __init__(self, iterable=None, **kwds): + '''Create a new, empty Counter object. And if given, count elements + from an input iterable. Or, initialize the count from another mapping + of elements to their counts. + + >>> c = Counter() # a new, empty counter + >>> c = Counter('gallahad') # a new counter from an iterable + >>> c = Counter({'a': 4, 'b': 2}) # a new counter from a mapping + >>> c = Counter(a=4, b=2) # a new counter from keyword args + + ''' + super(Counter, self).__init__() + self.update(iterable, **kwds) + + def __missing__(self, key): + 'The count of elements not in the Counter is zero.' + # Needed so that self[missing_item] does not raise KeyError + return 0 + + def most_common(self, n=None): + '''List the n most common elements and their counts from the most + common to the least. If n is None, then list all element counts. + + >>> Counter('abcdeabcdabcaba').most_common(3) + [('a', 5), ('b', 4), ('c', 3)] + + ''' + # Emulate Bag.sortedByCount from Smalltalk + if n is None: + return sorted(self.iteritems(), key=_itemgetter(1), reverse=True) + return _heapq.nlargest(n, self.iteritems(), key=_itemgetter(1)) + + def elements(self): + '''Iterator over elements repeating each as many times as its count. 
+ + >>> c = Counter('ABCABC') + >>> sorted(c.elements()) + ['A', 'A', 'B', 'B', 'C', 'C'] + + # Knuth's example for prime factors of 1836: 2**2 * 3**3 * 17**1 + >>> prime_factors = Counter({2: 2, 3: 3, 17: 1}) + >>> product = 1 + >>> for factor in prime_factors.elements(): # loop over factors + ... product *= factor # and multiply them + >>> product + 1836 + + Note, if an element's count has been set to zero or is a negative + number, elements() will ignore it. + + ''' + # Emulate Bag.do from Smalltalk and Multiset.begin from C++. + return _chain.from_iterable(_starmap(_repeat, self.iteritems())) + + # Override dict methods where necessary + + @classmethod + def fromkeys(cls, iterable, v=None): + # There is no equivalent method for counters because setting v=1 + # means that no element can have a count greater than one. + raise NotImplementedError( + 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') + + def update(self, iterable=None, **kwds): + '''Like dict.update() but add counts instead of replacing them. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.update('witch') # add elements from another iterable + >>> d = Counter('watch') + >>> c.update(d) # add elements from another counter + >>> c['h'] # four 'h' in which, witch, and watch + 4 + + ''' + # The regular dict.update() operation makes no sense here because the + # replace behavior results in some of the original untouched counts + # being mixed in with all of the other counts for a mishmash that + # doesn't have a straightforward interpretation in most counting + # contexts. Instead, we implement straight-addition. Both the inputs + # and outputs are allowed to contain zero and negative counts. + + if iterable is not None: + if isinstance(iterable, Mapping): + if self: + self_get = self.get + for elem, count in iterable.iteritems(): + self[elem] = self_get(elem, 0) + count + else: + super(Counter, self).update(iterable) # fast path when counter is empty + else: + self_get = self.get + for elem in iterable: + self[elem] = self_get(elem, 0) + 1 + if kwds: + self.update(kwds) + + def subtract(self, iterable=None, **kwds): + '''Like dict.update() but subtracts counts instead of replacing them. + Counts can be reduced below zero. Both the inputs and outputs are + allowed to contain zero and negative counts. + + Source can be an iterable, a dictionary, or another Counter instance. + + >>> c = Counter('which') + >>> c.subtract('witch') # subtract elements from another iterable + >>> c.subtract(Counter('watch')) # subtract elements from another counter + >>> c['h'] # 2 in which, minus 1 in witch, minus 1 in watch + 0 + >>> c['w'] # 1 in which, minus 1 in witch, minus 1 in watch + -1 + + ''' + if iterable is not None: + self_get = self.get + if isinstance(iterable, Mapping): + for elem, count in iterable.items(): + self[elem] = self_get(elem, 0) - count + else: + for elem in iterable: + self[elem] = self_get(elem, 0) - 1 + if kwds: + self.subtract(kwds) + + def copy(self): + 'Return a shallow copy.' + return self.__class__(self) + + def __reduce__(self): + return self.__class__, (dict(self),) + + def __delitem__(self, elem): + 'Like dict.__delitem__() but does not raise KeyError for missing values.'
+ if elem in self: + super(Counter, self).__delitem__(elem) + + def __repr__(self): + if not self: + return '%s()' % self.__class__.__name__ + items = ', '.join(map('%r: %r'.__mod__, self.most_common())) + return '%s({%s})' % (self.__class__.__name__, items) + + # Multiset-style mathematical operations discussed in: + # Knuth TAOCP Volume II section 4.6.3 exercise 19 + # and at http://en.wikipedia.org/wiki/Multiset + # + # Outputs guaranteed to only include positive counts. + # + # To strip negative and zero counts, add-in an empty counter: + # c += Counter() + + def __add__(self, other): + '''Add counts from two counters. + + >>> Counter('abbb') + Counter('bcc') + Counter({'b': 4, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count + other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __sub__(self, other): + ''' Subtract count, but keep only results with positive counts. + + >>> Counter('abbbc') - Counter('bccd') + Counter({'b': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + newcount = count - other[elem] + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count < 0: + result[elem] = 0 - count + return result + + def __or__(self, other): + '''Union is the maximum of value in either of the input counters. + + >>> Counter('abbb') | Counter('bcc') + Counter({'b': 3, 'c': 2, 'a': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = other_count if count < other_count else count + if newcount > 0: + result[elem] = newcount + for elem, count in other.items(): + if elem not in self and count > 0: + result[elem] = count + return result + + def __and__(self, other): + ''' Intersection is the minimum of corresponding counts. + + >>> Counter('abbb') & Counter('bcc') + Counter({'b': 1}) + + ''' + if not isinstance(other, Counter): + return NotImplemented + result = Counter() + for elem, count in self.items(): + other_count = other[elem] + newcount = count if count < other_count else other_count + if newcount > 0: + result[elem] = newcount + return result diff --git a/pandas/util/decorators.py b/pandas/util/decorators.py new file mode 100644 index 00000000..5f535c77 --- /dev/null +++ b/pandas/util/decorators.py @@ -0,0 +1,171 @@ +from pandas.util.py3compat import StringIO +from pandas.lib import cache_readonly +import sys +import warnings + +def deprecate(name, alternative): + alt_name = alternative.func_name + def wrapper(*args, **kwargs): + warnings.warn("%s is deprecated. Use %s instead" % (name, alt_name), + FutureWarning) + return alternative(*args, **kwargs) + return wrapper + +# Substitution and Appender are derived from matplotlib.docstring (1.1.0) +# module http://matplotlib.sourceforge.net/users/license.html + +class Substitution(object): + """ + A decorator to take a function's docstring and perform string + substitution on it. 
+ + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter) + + Usage: construct a docstring.Substitution with a sequence or + dictionary suitable for performing substitution; then + decorate a suitable function with the constructed object. e.g. + + sub_author_name = Substitution(author='Jason') + + @sub_author_name + def some_function(x): + "%(author)s wrote this function" + + # note that some_function.__doc__ is now "Jason wrote this function" + + One can also use positional arguments. + + sub_first_last_names = Substitution('Edgar Allen', 'Poe') + + @sub_first_last_names + def some_function(x): + "%s %s wrote the Raven" + """ + def __init__(self, *args, **kwargs): + assert not (args and kwargs), "Only positional or keyword args are allowed" + self.params = args or kwargs + + def __call__(self, func): + func.__doc__ = func.__doc__ and func.__doc__ % self.params + return func + + def update(self, *args, **kwargs): + "Assume self.params is a dict and update it with supplied args" + self.params.update(*args, **kwargs) + + @classmethod + def from_params(cls, params): + """ + In the case where the params is a mutable sequence (list or dictionary) + and it may change before this class is called, one may explicitly use a + reference to the params rather than using *args or **kwargs which will + copy the values and not reference them. + """ + result = cls() + result.params = params + return result + +class Appender(object): + """ + A function decorator that will append an addendum to the docstring + of the target function. + + This decorator should be robust even if func.__doc__ is None + (for example, if -OO was passed to the interpreter). + + Usage: construct a docstring.Appender with a string to be joined to + the original docstring. An optional 'join' parameter may be supplied + which will be used to join the docstring and addendum. e.g. + + add_copyright = Appender("Copyright (c) 2009", join='\n') + + @add_copyright + def my_dog(has='fleas'): + "This docstring will have a copyright below" + pass + """ + def __init__(self, addendum, join='', indents=0): + if indents > 0: + self.addendum = indent(addendum, indents=indents) + else: + self.addendum = addendum + self.join = join + + def __call__(self, func): + docitems = [func.__doc__ if func.__doc__ else '', self.addendum] + func.__doc__ = ''.join(docitems) + return func + +def indent(text, indents=1): + if not text or type(text) != str: + return '' + jointext = ''.join(['\n'] + [' '] * indents) + return jointext.join(text.split('\n')) + +def suppress_stdout(f): + def wrapped(*args, **kwargs): + try: + sys.stdout = StringIO() + f(*args, **kwargs) + finally: + sys.stdout = sys.__stdout__ + + return wrapped + + +class KnownFailureTest(Exception): + '''Raise this exception to mark a test as a known failing test.''' + pass + +def knownfailureif(fail_condition, msg=None): + """ + Make function raise KnownFailureTest exception if given condition is true. + + If the condition is a callable, it is used at runtime to dynamically + make the decision. This is useful for tests that may require costly + imports, to delay the cost until the test suite is actually executed. + + Parameters + ---------- + fail_condition : bool or callable + Flag to determine whether to mark the decorated test as a known + failure (if True) or not (if False). + msg : str, optional + Message to give on raising a KnownFailureTest exception. + Default is None. 
+ + Returns + ------- + decorator : function + Decorator, which, when applied to a function, causes SkipTest + to be raised when `skip_condition` is True, and the function + to be called normally otherwise. + + Notes + ----- + The decorator itself is decorated with the ``nose.tools.make_decorator`` + function in order to transmit function name, and various other metadata. + + """ + if msg is None: + msg = 'Test skipped due to known failure' + + # Allow for both boolean or callable known failure conditions. + if callable(fail_condition): + fail_val = fail_condition + else: + fail_val = lambda: fail_condition + + def knownfail_decorator(f): + # Local import to avoid a hard nose dependency and only incur the + # import time overhead at actual test-time. + import nose + def knownfailer(*args, **kwargs): + if fail_val(): + raise KnownFailureTest, msg + else: + return f(*args, **kwargs) + return nose.tools.make_decorator(f)(knownfailer) + + return knownfail_decorator diff --git a/pandas/util/map.py b/pandas/util/map.py new file mode 100644 index 00000000..65ab1e97 --- /dev/null +++ b/pandas/util/map.py @@ -0,0 +1,69 @@ +import numpy as np +from pandas import _tseries as lib +from pandas import notnull, Series +from functools import wraps + +class repeat(object): + def __init__(self, obj): + self.obj = obj + + def __getitem__(self, i): + return self.obj + +class azip(object): + def __init__(self, *args): + self.cols = [] + for a in args: + if np.isscalar(a): + self.cols.append(repeat(a)) + else: + self.cols.append(a) + + def __getitem__(self, i): + return [col[i] for col in self.cols] + +def map_iter_args(arr, f, otherargs, n_otherargs, required, n_results): + ''' + Substitute for np.vectorize with pandas-friendly dtype inference + + Parameters + ---------- + arr : ndarray + f : function + + Returns + ------- + mapped : ndarray + ''' + n = len(arr) + result = np.empty((n, n_results), dtype=object) + for i, val in enumerate(arr): + args = otherargs[i] + if notnull(val) and all(notnull(args[r]) for r in required): + result[i] = f(val, *args) + else: + result[i] = [np.nan] * n_results + + return [lib.maybe_convert_objects(col, try_float=0) for col in result.T] + +def auto_map(arr, f, otherargs, n_results=1, required='all'): + if all(np.isscalar(a) for a in otherargs): + res = lib.map_infer(arr, lambda v: f(v, *otherargs)) + return Series(res, index=arr.index, copy=False) + + n_otherargs = len(otherargs) + if required == 'all': + required = list(range(n_otherargs)) + res = map_iter_args(arr, f, azip(*otherargs), n_otherargs, required, n_results) + res = [Series(col, index=arr.index, copy=False) for col in res] + if n_results == 1: + return res[0] + return res + +def mapwrap(f, n_results_default=1, required='all'): + @wraps(f) + def wrapped(arr, n_results=None, *otherargs): + n_results = n_results or n_results_default + return auto_map(arr, f, otherargs, n_results, required) + + return wrapped diff --git a/pandas/util/misc.py b/pandas/util/misc.py new file mode 100644 index 00000000..25edfb74 --- /dev/null +++ b/pandas/util/misc.py @@ -0,0 +1,4 @@ +def exclusive(*args): + count = sum([arg is not None for arg in args]) + return count == 1 + diff --git a/pandas/util/py3compat.py b/pandas/util/py3compat.py new file mode 100644 index 00000000..9a602155 --- /dev/null +++ b/pandas/util/py3compat.py @@ -0,0 +1,37 @@ +import sys + +PY3 = (sys.version_info[0] >= 3) + +if PY3: + def isidentifier(s): + return s.isidentifier() + + def str_to_bytes(s, encoding='ascii'): + return s.encode(encoding) + + def 
bytes_to_str(b, encoding='utf-8'): + return b.decode(encoding) + +else: + # Python 2 + import re + _name_re = re.compile(r"[a-zA-Z_][a-zA-Z0-9_]*$") + def isidentifier(s, dotted=False): + return bool(_name_re.match(s)) + + def str_to_bytes(s, encoding='ascii'): + return s + + def bytes_to_str(b, encoding='ascii'): + return b + +try: + from cStringIO import StringIO +except: + from io import StringIO + +try: + from io import BytesIO +except: + from cStringIO import StringIO as BytesIO + diff --git a/pandas/util/terminal.py b/pandas/util/terminal.py new file mode 100644 index 00000000..4278f35b --- /dev/null +++ b/pandas/util/terminal.py @@ -0,0 +1,108 @@ +""" +get_terminal_size() -- return width and height of console as a tuple + +code from: +http://stackoverflow.com/questions/566746/how-to-get-console- window-width-in- +python + +written by +Harco Kuppens (http://stackoverflow.com/users/825214/harco-kuppens) + +It is mentioned in the stackoverflow response that this code works +on linux, os x, windows and cygwin (windows). +""" + +import os + +__all__=['get_terminal_size'] + + +def get_terminal_size(): + import platform + current_os = platform.system() + tuple_xy=None + if current_os == 'Windows': + tuple_xy = _get_terminal_size_windows() + if tuple_xy is None: + tuple_xy = _get_terminal_size_tput() + # needed for window's python in cygwin's xterm! + if current_os == 'Linux' or \ + current_os == 'Darwin' or \ + current_os.startswith('CYGWIN'): + tuple_xy = _get_terminal_size_linux() + if tuple_xy is None: + tuple_xy = (80, 25) # default value + return tuple_xy + +def _get_terminal_size_windows(): + res=None + try: + from ctypes import windll, create_string_buffer + + # stdin handle is -10 + # stdout handle is -11 + # stderr handle is -12 + + h = windll.kernel32.GetStdHandle(-12) + csbi = create_string_buffer(22) + res = windll.kernel32.GetConsoleScreenBufferInfo(h, csbi) + except: + return None + if res: + import struct + (bufx, bufy, curx, cury, wattr, left, top, right, bottom, maxx, + maxy) = struct.unpack("hhhhHhhhhhh", csbi.raw) + sizex = right - left + 1 + sizey = bottom - top + 1 + return sizex, sizey + else: + return None + +def _get_terminal_size_tput(): + # get terminal width + # src: http://stackoverflow.com/questions/263890/how-do-i-find-the-width + # -height-of-a-terminal-window + try: + import subprocess + proc = subprocess.Popen(["tput", "cols"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + output=proc.communicate(input=None) + cols=int(output[0]) + proc=subprocess.Popen(["tput", "lines"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + output=proc.communicate(input=None) + rows=int(output[0]) + return (cols,rows) + except: + return None + + +def _get_terminal_size_linux(): + def ioctl_GWINSZ(fd): + try: + import fcntl, termios, struct, os + cr = struct.unpack('hh', fcntl.ioctl(fd, termios.TIOCGWINSZ,'1234')) + except: + return None + return cr + cr = ioctl_GWINSZ(0) or ioctl_GWINSZ(1) or ioctl_GWINSZ(2) + if not cr: + try: + fd = os.open(os.ctermid(), os.O_RDONLY) + cr = ioctl_GWINSZ(fd) + os.close(fd) + except: + pass + if not cr or cr == (0, 0): + try: + from os import environ as env + cr = (env['LINES'], env['COLUMNS']) + except: + return None + return int(cr[1]), int(cr[0]) + +if __name__ == "__main__": + sizex, sizey = get_terminal_size() + print 'width =', sizex, 'height =', sizey diff --git a/pandas/util/testing.py b/pandas/util/testing.py new file mode 100644 index 00000000..01117f3e --- /dev/null +++ b/pandas/util/testing.py @@ -0,0 +1,380 @@ +from 
__future__ import division + +# pylint: disable-msg=W0402 + +from datetime import datetime +import random +import string +import sys + +from distutils.version import LooseVersion + +from numpy.random import randn +import numpy as np + +from pandas.core.common import isnull +import pandas.core.index as index +import pandas.core.series as series +import pandas.core.frame as frame +import pandas.core.panel as panel + +from pandas import bdate_range +from pandas.tseries.index import DatetimeIndex +from pandas.tseries.period import PeriodIndex +from pandas.tseries.interval import IntervalIndex + + +Index = index.Index +Series = series.Series +DataFrame = frame.DataFrame +Panel = panel.Panel + +N = 30 +K = 4 + +def rands(n): + choices = string.ascii_letters + string.digits + return ''.join([random.choice(choices) for _ in xrange(n)]) + +#------------------------------------------------------------------------------- +# Console debugging tools + +def debug(f, *args, **kwargs): + from pdb import Pdb as OldPdb + try: + from IPython.core.debugger import Pdb + kw = dict(color_scheme='Linux') + except ImportError: + Pdb = OldPdb + kw = {} + pdb = Pdb(**kw) + return pdb.runcall(f, *args, **kwargs) + +def set_trace(): + from IPython.core.debugger import Pdb + try: + Pdb(color_scheme='Linux').set_trace(sys._getframe().f_back) + except: + from pdb import Pdb as OldPdb + OldPdb().set_trace(sys._getframe().f_back) + +#------------------------------------------------------------------------------- +# Comparators + +def equalContents(arr1, arr2): + """Checks if the set of unique elements of arr1 and arr2 are equivalent. + """ + return frozenset(arr1) == frozenset(arr2) + +def isiterable(obj): + return hasattr(obj, '__iter__') + +def assert_almost_equal(a, b): + if isinstance(a, dict) or isinstance(b, dict): + return assert_dict_equal(a, b) + + if isinstance(a, basestring): + assert a == b, (a, b) + return True + + if isiterable(a): + np.testing.assert_(isiterable(b)) + np.testing.assert_equal(len(a), len(b)) + if np.array_equal(a, b): + return True + else: + for i in xrange(len(a)): + assert_almost_equal(a[i], b[i]) + return True + + err_msg = lambda a, b: 'expected %.5f but got %.5f' % (a, b) + + if isnull(a): + np.testing.assert_(isnull(b)) + return + + if isinstance(a, (bool, float, int)): + # case for zero + if abs(a) < 1e-5: + np.testing.assert_almost_equal( + a, b, decimal=5, err_msg=err_msg(a, b), verbose=False) + else: + np.testing.assert_almost_equal( + 1, a/b, decimal=5, err_msg=err_msg(a, b), verbose=False) + else: + assert(a == b) + +def is_sorted(seq): + return assert_almost_equal(seq, np.sort(np.array(seq))) + +def assert_dict_equal(a, b, compare_keys=True): + a_keys = frozenset(a.keys()) + b_keys = frozenset(b.keys()) + + if compare_keys: + assert(a_keys == b_keys) + + for k in a_keys: + assert_almost_equal(a[k], b[k]) + +def assert_series_equal(left, right, check_dtype=True, + check_index_type=False, + check_index_freq=False, + check_series_type=False): + if check_series_type: + assert(type(left) == type(right)) + assert_almost_equal(left.values, right.values) + if check_dtype: + assert(left.dtype == right.dtype) + assert(left.index.equals(right.index)) + if check_index_type: + assert(type(left.index) == type(right.index)) + assert(left.index.dtype == right.index.dtype) + assert(left.index.inferred_type == right.index.inferred_type) + if check_index_freq: + assert(getattr(left, 'freqstr', None) == + getattr(right, 'freqstr', None)) + +def assert_frame_equal(left, right, check_index_type=False, 
+ check_column_type=False, + check_frame_type=False): + if check_frame_type: + assert(type(left) == type(right)) + assert(isinstance(left, DataFrame)) + assert(isinstance(right, DataFrame)) + for col, series in left.iterkv(): + assert(col in right) + assert_series_equal(series, right[col]) + for col in right: + assert(col in left) + assert(left.index.equals(right.index)) + assert(left.columns.equals(right.columns)) + if check_index_type: + assert(type(left.index) == type(right.index)) + assert(left.index.dtype == right.index.dtype) + assert(left.index.inferred_type == right.index.inferred_type) + if check_column_type: + assert(type(left.columns) == type(right.columns)) + assert(left.columns.dtype == right.columns.dtype) + assert(left.columns.inferred_type == right.columns.inferred_type) + +def assert_panel_equal(left, right, check_panel_type=False): + if check_panel_type: + assert(type(left) == type(right)) + + assert(left.items.equals(right.items)) + assert(left.major_axis.equals(right.major_axis)) + assert(left.minor_axis.equals(right.minor_axis)) + + for col, series in left.iterkv(): + assert(col in right) + assert_frame_equal(series, right[col]) + + for col in right: + assert(col in left) + +def assert_contains_all(iterable, dic): + for k in iterable: + assert(k in dic) + +def getCols(k): + return string.ascii_uppercase[:k] + +def makeStringIndex(k): + return Index([rands(10) for _ in xrange(k)]) + +def makeIntIndex(k): + return Index(range(k)) + +def makeFloatIndex(k): + values = sorted(np.random.random_sample(k)) - np.random.random_sample(1) + return Index(values * (10 ** np.random.randint(0, 9))) + +def makeFloatSeries(): + index = makeStringIndex(N) + return Series(randn(N), index=index) + +def makeStringSeries(): + index = makeStringIndex(N) + return Series(randn(N), index=index) + +def makeObjectSeries(): + dateIndex = makeDateIndex(N) + dateIndex = Index(dateIndex, dtype=object) + index = makeStringIndex(N) + return Series(dateIndex, index=index) + +def getSeriesData(): + index = makeStringIndex(N) + return dict((c, Series(randn(N), index=index)) for c in getCols(K)) + +def makeDataFrame(): + data = getSeriesData() + return DataFrame(data) + +def getArangeMat(): + return np.arange(N * K).reshape((N, K)) + +def getMixedTypeDict(): + index = Index(['a', 'b', 'c', 'd', 'e']) + + data = { + 'A' : [0., 1., 2., 3., 4.], + 'B' : [0., 1., 0., 1., 0.], + 'C' : ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], + 'D' : bdate_range('1/1/2009', periods=5) + } + + return index, data + +def makeDateIndex(k): + dt = datetime(2000,1,1) + dr = bdate_range(dt, periods=k) + return DatetimeIndex(dr) + +def makePeriodIndex(k): + dt = datetime(2000,1,1) + dr = PeriodIndex(start=dt, periods=k, freq='B') + return dr + +def makeTimeSeries(nper=None): + if nper is None: + nper = N + return Series(randn(nper), index=makeDateIndex(nper)) + +def makePeriodSeries(nper=None): + if nper is None: + nper = N + return Series(randn(nper), index=makePeriodIndex(nper)) + +def getTimeSeriesData(): + return dict((c, makeTimeSeries()) for c in getCols(K)) + +def makeTimeDataFrame(): + data = getTimeSeriesData() + return DataFrame(data) + +def getPeriodData(): + return dict((c, makePeriodSeries()) for c in getCols(K)) + +def makePeriodFrame(): + data = getPeriodData() + return DataFrame(data) + +def makePanel(): + cols = ['Item' + c for c in string.ascii_uppercase[:K - 1]] + data = dict((c, makeTimeDataFrame()) for c in cols) + return Panel.fromDict(data) + +def add_nans(panel): + I, J, N = panel.shape + for i, item in 
enumerate(panel.items): + dm = panel[item] + for j, col in enumerate(dm.columns): + dm[col][:i + j] = np.NaN + +class TestSubDict(dict): + def __init__(self, *args, **kwargs): + dict.__init__(self, *args, **kwargs) + + +# Dependency checks. Copied this from Nipy/Nipype (Copyright of +# respective developers, license: BSD-3) +def package_check(pkg_name, version=None, app='pandas', checker=LooseVersion, + exc_failed_import=ImportError, + exc_failed_check=RuntimeError): + """Check that the minimal version of the required package is installed. + + Parameters + ---------- + pkg_name : string + Name of the required package. + version : string, optional + Minimal version number for required package. + app : string, optional + Application that is performing the check. For instance, the + name of the tutorial being executed that depends on specific + packages. + checker : object, optional + The class that will perform the version checking. Default is + distutils.version.LooseVersion. + exc_failed_import : Exception, optional + Class of the exception to be thrown if import failed. + exc_failed_check : Exception, optional + Class of the exception to be thrown if version check failed. + + Examples + -------- + package_check('numpy', '1.3') + package_check('networkx', '1.0', 'tutorial1') + + """ + + if app: + msg = '%s requires %s' % (app, pkg_name) + else: + msg = 'module requires %s' % pkg_name + if version: + msg += ' with version >= %s' % (version,) + try: + mod = __import__(pkg_name) + except ImportError: + raise exc_failed_import(msg) + if not version: + return + try: + have_version = mod.__version__ + except AttributeError: + raise exc_failed_check('Cannot find version for %s' % pkg_name) + if checker(have_version) < checker(version): + raise exc_failed_check(msg) + +def skip_if_no_package(*args, **kwargs): + """Raise SkipTest if package_check fails + + Parameters + ---------- + *args Positional parameters passed to `package_check` + *kwargs Keyword parameters passed to `package_check` + """ + from nose import SkipTest + package_check(exc_failed_import=SkipTest, + exc_failed_check=SkipTest, + *args, **kwargs) + +# +# Additional tags decorators for nose +# +def network(t): + """ + Label a test as requiring network connection. + + In some cases it is not possible to assume network presence (e.g. Debian + build hosts). + + Parameters + ---------- + t : callable + The test requiring network connectivity. + + Returns + ------- + t : callable + The decorated test `t`. + + Examples + -------- + A test can be decorated as requiring network like this:: + + from pandas.util.testing import * + + @network + def test_network(self): + print 'Fetch the stars from http://' + + And use ``nosetests -a '!network'`` to exclude running tests requiring + network connectivity. 
+ """ + + t.network = True + return t diff --git a/scripts/bench_join.R b/scripts/bench_join.R new file mode 100644 index 00000000..edba277f --- /dev/null +++ b/scripts/bench_join.R @@ -0,0 +1,50 @@ +library(xts) + +iterations <- 50 + +ns = c(100, 1000, 10000, 100000, 1000000) +kinds = c("outer", "left", "inner") + +result = matrix(0, nrow=3, ncol=length(ns)) +n <- 100000 +pct.overlap <- 0.2 + +k <- 1 + +for (ni in 1:length(ns)){ + n <- ns[ni] + rng1 <- 1:n + offset <- as.integer(n * pct.overlap) + rng2 <- rng1 + offset + x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), + as.POSIXct(Sys.Date()) + rng1) + y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), + as.POSIXct(Sys.Date()) + rng2) + timing <- numeric() + for (i in 1:3) { + kind = kinds[i] + for(j in 1:iterations) { + gc() # just to be sure + timing[j] <- system.time(merge(x,y,join=kind))[3] + } + #timing <- system.time(for (j in 1:iterations) merge.xts(x, y, join=kind), + # gcFirst=F) + #timing <- as.list(timing) + result[i, ni] <- mean(timing) * 1000 + #result[i, ni] = (timing$elapsed / iterations) * 1000 + } +} + +rownames(result) <- kinds +colnames(result) <- log10(ns) + +mat <- matrix(rnorm(500000), nrow=100000, ncol=5) +set.seed(12345) +indexer <- sample(1:100000) + +timing <- rep(0, 10) +for (i in 1:10) { + gc() + timing[i] = system.time(mat[indexer,])[3] +} + diff --git a/scripts/bench_join.py b/scripts/bench_join.py new file mode 100644 index 00000000..d838cf93 --- /dev/null +++ b/scripts/bench_join.py @@ -0,0 +1,197 @@ +import numpy as np +import pandas.lib as lib +from pandas import * +from copy import deepcopy +import time + +n = 1000000 +K = 1 +pct_overlap = 0.2 + +a = np.arange(n, dtype=np.int64) +b = np.arange(n * pct_overlap, n*(1+pct_overlap), dtype=np.int64) + +dr1 = DateRange('1/1/2000', periods=n, offset=datetools.Minute()) +dr2 = DateRange(dr1[int(pct_overlap*n)], periods=n, offset=datetools.Minute(2)) + +aobj = a.astype(object) +bobj = b.astype(object) + +av = np.random.randn(n) +bv = np.random.randn(n) + +avf = np.random.randn(n, K) +bvf = np.random.randn(n, K) + +a_series = Series(av, index=a) +b_series = Series(bv, index=b) + +a_frame = DataFrame(avf, index=a, columns=range(K)) +b_frame = DataFrame(bvf, index=b, columns=range(K, 2 * K)) + +def do_left_join(a, b, av, bv): + out = np.empty((len(a), 2)) + lib.left_join_1d(a, b, av, bv, out) + return out + +def do_outer_join(a, b, av, bv): + result_index, aindexer, bindexer = lib.outer_join_indexer(a, b) + result = np.empty((2, len(result_index))) + lib.take_1d(av, aindexer, result[0]) + lib.take_1d(bv, bindexer, result[1]) + return result_index, result + +def do_inner_join(a, b, av, bv): + result_index, aindexer, bindexer = lib.inner_join_indexer(a, b) + result = np.empty((2, len(result_index))) + lib.take_1d(av, aindexer, result[0]) + lib.take_1d(bv, bindexer, result[1]) + return result_index, result + +from line_profiler import LineProfiler +prof = LineProfiler() + +from pandas.util.testing import set_trace + +def do_left_join_python(a, b, av, bv): + indexer, mask = lib.ordered_left_join_int64(a, b) + + n, ak = av.shape + _, bk = bv.shape + result_width = ak + bk + + result = np.empty((result_width, n), dtype=np.float64) + result[:ak] = av.T + + bchunk = result[ak:] + _take_multi(bv.T, indexer, bchunk) + np.putmask(bchunk, np.tile(mask, bk), np.nan) + return result + +def _take_multi(data, indexer, out): + if not data.flags.c_contiguous: + data = data.copy() + for i in xrange(data.shape[0]): + data[i].take(indexer, out=out[i]) + +def do_left_join_multi(a, b, av, bv): 
+ n, ak = av.shape + _, bk = bv.shape + result = np.empty((n, ak + bk), dtype=np.float64) + lib.left_join_2d(a, b, av, bv, result) + return result + +def do_outer_join_multi(a, b, av, bv): + n, ak = av.shape + _, bk = bv.shape + result_index, rindexer, lindexer = lib.outer_join_indexer(a, b) + result = np.empty((len(result_index), ak + bk), dtype=np.float64) + lib.take_join_contiguous(av, bv, lindexer, rindexer, result) + # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) + # lib.take_axis0(av, rindexer, out=result[:ak].T) + # lib.take_axis0(bv, lindexer, out=result[ak:].T) + return result_index, result + +def do_inner_join_multi(a, b, av, bv): + n, ak = av.shape + _, bk = bv.shape + result_index, rindexer, lindexer = lib.inner_join_indexer(a, b) + result = np.empty((len(result_index), ak + bk), dtype=np.float64) + lib.take_join_contiguous(av, bv, lindexer, rindexer, result) + # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) + # lib.take_axis0(av, rindexer, out=result[:ak].T) + # lib.take_axis0(bv, lindexer, out=result[ak:].T) + return result_index, result + +def do_left_join_multi_v2(a, b, av, bv): + indexer, mask = lib.ordered_left_join_int64(a, b) + bv_taken = bv.take(indexer, axis=0) + np.putmask(bv_taken, mask.repeat(bv.shape[1]), np.nan) + return np.concatenate((av, bv_taken), axis=1) + + +def do_left_join_series(a, b): + return b.reindex(a.index) + +def do_left_join_frame(a, b): + a.index._indexMap = None + b.index._indexMap = None + return a.join(b, how='left') + + +# a = np.array([1, 2, 3, 4, 5], dtype=np.int64) +# b = np.array([0, 3, 5, 7, 9], dtype=np.int64) +# print lib.inner_join_indexer(a, b) + +out = np.empty((10, 120000)) + +def join(a, b, av, bv, how="left"): + func_dict = {'left' : do_left_join_multi, + 'outer' : do_outer_join_multi, + 'inner' : do_inner_join_multi} + + f = func_dict[how] + return f(a, b, av, bv) + +def bench_python(n=100000, pct_overlap=0.20, K=1): + import gc + ns = [2, 3, 4, 5, 6] + iterations = 200 + pct_overlap = 0.2 + kinds = ['outer', 'left', 'inner'] + + all_results = {} + for logn in ns: + n = 10**logn + a = np.arange(n, dtype=np.int64) + b = np.arange(n * pct_overlap, n * pct_overlap + n, dtype=np.int64) + + avf = np.random.randn(n, K) + bvf = np.random.randn(n, K) + + a_frame = DataFrame(avf, index=a, columns=range(K)) + b_frame = DataFrame(bvf, index=b, columns=range(K, 2 * K)) + + all_results[logn] = result = {} + + for kind in kinds: + gc.disable() + elapsed = 0 + _s = time.clock() + for i in range(iterations): + if i % 10 == 0: + elapsed += time.clock() - _s + gc.collect() + _s = time.clock() + a_frame.join(b_frame, how=kind) + # join(a, b, avf, bvf, how=kind) + elapsed += time.clock() - _s + gc.enable() + result[kind] = (elapsed / iterations) * 1000 + + return DataFrame(all_results, index=kinds) + +def bench_xts(n=100000, pct_overlap=0.20): + from pandas.rpy.common import r + r('a <- 5') + + xrng = '1:%d' % n + + start = n * pct_overlap + 1 + end = n + start - 1 + yrng = '%d:%d' % (start, end) + + r('library(xts)') + + iterations = 500 + + kinds = ['left', 'outer', 'inner'] + result = {} + for kind in kinds: + r('x <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, xrng)) + r('y <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, yrng)) + stmt = 'for (i in 1:%d) merge(x, y, join="%s")' % (iterations, kind) + elapsed = r('as.list(system.time(%s, gcFirst=F))$elapsed' % stmt)[0] + result[kind] = (elapsed / iterations) * 1000 + return Series(result) + diff --git a/scripts/bench_join_multi.py 
b/scripts/bench_join_multi.py new file mode 100644 index 00000000..a0babaf5 --- /dev/null +++ b/scripts/bench_join_multi.py @@ -0,0 +1,30 @@ +from pandas import * + +import numpy as np +from itertools import izip +from pandas.util.testing import rands +import pandas.lib as lib + +N = 100000 + +key1 = [rands(10) for _ in xrange(N)] +key2 = [rands(10) for _ in xrange(N)] + +zipped = izip(key1, key2) + +def _zip(*args): + arr = np.empty(N, dtype=object) + arr[:] = zip(*args) + return arr + +def _zip2(*args): + return lib.list_to_object_array(zip(*args)) + +index = MultiIndex.from_arrays([key1, key2]) +to_join = DataFrame({'j1' : np.random.randn(100000)}, index=index) + +data = DataFrame({'A' : np.random.randn(500000), + 'key1' : np.repeat(key1, 5), + 'key2' : np.repeat(key2, 5)}) + +# data.join(to_join, on=['key1', 'key2']) diff --git a/scripts/bench_refactor.py b/scripts/bench_refactor.py new file mode 100644 index 00000000..5ae36f7d --- /dev/null +++ b/scripts/bench_refactor.py @@ -0,0 +1,46 @@ +from pandas import * +try: + import pandas.core.internals as internals + reload(internals) + import pandas.core.frame as frame + reload(frame) + from pandas.core.frame import DataFrame as DataMatrix +except ImportError: + pass + +N = 1000 +K = 500 + +def horribly_unconsolidated(): + index = np.arange(N) + + df = DataMatrix(index=index) + + for i in xrange(K): + df[i] = float(K) + + return df + +def bench_reindex_index(df, it=100): + new_idx = np.arange(0, N, 2) + for i in xrange(it): + df.reindex(new_idx) + +def bench_reindex_columns(df, it=100): + new_cols = np.arange(0, K, 2) + for i in xrange(it): + df.reindex(columns=new_cols) + +def bench_join_index(df, it=10): + left = df.reindex(index=np.arange(0, N, 2), + columns=np.arange(K // 2)) + right = df.reindex(columns=np.arange(K // 2 + 1, K)) + for i in xrange(it): + joined = left.join(right) + +if __name__ == '__main__': + df = horribly_unconsolidated() + left = df.reindex(index=np.arange(0, N, 2), + columns=np.arange(K // 2)) + right = df.reindex(columns=np.arange(K // 2 + 1, K)) + bench_join_index(df) diff --git a/scripts/boxplot_test.py b/scripts/boxplot_test.py new file mode 100644 index 00000000..3704f7b6 --- /dev/null +++ b/scripts/boxplot_test.py @@ -0,0 +1,14 @@ +import matplotlib.pyplot as plt + +import random +import pandas.util.testing as tm +tm.N = 1000 +df = tm.makeTimeDataFrame() +import string +foo = list(string.letters[:5]) * 200 +df['indic'] = list(string.letters[:5]) * 200 +random.shuffle(foo) +df['indic2'] = foo +df.boxplot(by=['indic', 'indic2'], fontsize=8, rot=90) + +plt.show() diff --git a/scripts/count_code.sh b/scripts/count_code.sh new file mode 100755 index 00000000..a4db9560 --- /dev/null +++ b/scripts/count_code.sh @@ -0,0 +1 @@ +cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c|plib.c" \ No newline at end of file diff --git a/scripts/faster_xs.py b/scripts/faster_xs.py new file mode 100644 index 00000000..a539642b --- /dev/null +++ b/scripts/faster_xs.py @@ -0,0 +1,16 @@ +import numpy as np + +import pandas.util.testing as tm + +from pandas.core.internals import _interleaved_dtype + +df = tm.makeDataFrame() + +df['E'] = 'foo' +df['F'] = 'foo' +df['G'] = 2 +df['H'] = df['A'] > 0 + +blocks = df._data.blocks +items = df.columns + diff --git a/scripts/file_sizes.py b/scripts/file_sizes.py new file mode 100644 index 00000000..edbd23c8 --- /dev/null +++ b/scripts/file_sizes.py @@ -0,0 +1,198 @@ +import os +import sys + +import numpy as np +import matplotlib.pyplot as plt + 
+from pandas import DataFrame +from pandas.util.testing import set_trace + +dirs = [] +names = [] +lengths = [] + +if len(sys.argv) > 1: + loc = sys.argv[1] +else: + loc = '.' +walked = os.walk(loc) + +def _should_count_file(path): + return path.endswith('.py') or path.endswith('.pyx') + +def _is_def_line(line): + """def/cdef/cpdef, but not `cdef class`""" + return (line.endswith(':') and not 'class' in line.split() and + (line.startswith('def ') or + line.startswith('cdef ') or + line.startswith('cpdef ') or + ' def ' in line or ' cdef ' in line or ' cpdef ' in line)) + +class LengthCounter(object): + """ + should add option for subtracting nested function lengths?? + """ + def __init__(self, lines): + self.lines = lines + self.pos = 0 + self.counts = [] + self.n = len(lines) + + def get_counts(self): + self.pos = 0 + self.counts = [] + while self.pos < self.n: + line = self.lines[self.pos] + self.pos += 1 + if _is_def_line(line): + level = _get_indent_level(line) + self._count_function(indent_level=level) + return self.counts + + def _count_function(self, indent_level=1): + indent = ' ' * indent_level + + def _end_of_function(line): + return (line != '' and + not line.startswith(indent) and + not line.startswith('#')) + + start_pos = self.pos + while self.pos < self.n: + line = self.lines[self.pos] + if _end_of_function(line): + self._push_count(start_pos) + return + + self.pos += 1 + + if _is_def_line(line): + self._count_function(indent_level=indent_level + 1) + + # end of file + self._push_count(start_pos) + + def _push_count(self, start_pos): + func_lines = self.lines[start_pos:self.pos] + + if len(func_lines) > 300: + set_trace() + + # remove blank lines at end + while len(func_lines) > 0 and func_lines[-1] == '': + func_lines = func_lines[:-1] + + # remove docstrings and comments + clean_lines = [] + in_docstring = False + for line in func_lines: + line = line.strip() + if in_docstring and _is_triplequote(line): + in_docstring = False + continue + + if line.startswith('#'): + continue + + if _is_triplequote(line): + in_docstring = True + continue + + self.counts.append(len(func_lines)) + +def _get_indent_level(line): + level = 0 + while line.startswith(' ' * level): + level += 1 + return level + +def _is_triplequote(line): + return line.startswith('"""') or line.startswith("'''") + +def _get_file_function_lengths(path): + lines = [x.rstrip() for x in open(path).readlines()] + counter = LengthCounter(lines) + return counter.get_counts() + +# def test_get_function_lengths(): +text = """ +class Foo: + +def foo(): + def bar(): + a = 1 + + b = 2 + + c = 3 + + foo = 'bar' + +def x(): + a = 1 + + b = 3 + + c = 7 + + pass +""" + +expected = [5, 8, 7] + +lines = [x.rstrip() for x in text.splitlines()] +counter = LengthCounter(lines) +result = counter.get_counts() +assert(result == expected) + +def doit(): + for directory, _, files in walked: + print directory + for path in files: + if not _should_count_file(path): + continue + + full_path = os.path.join(directory, path) + print full_path + lines = len(open(full_path).readlines()) + + dirs.append(directory) + names.append(path) + lengths.append(lines) + + result = DataFrame({'dirs' : dirs, 'names' : names, + 'lengths' : lengths}) + +def doit2(): + counts = {} + for directory, _, files in walked: + print directory + for path in files: + if not _should_count_file(path) or path.startswith('test_'): + continue + + full_path = os.path.join(directory, path) + counts[full_path] = _get_file_function_lengths(full_path) + + return counts + +counts = 
doit2() + +# counts = _get_file_function_lengths('pandas/tests/test_series.py') + +all_counts = [] +for k, v in counts.iteritems(): + all_counts.extend(v) +all_counts = np.array(all_counts) + +fig = plt.figure(figsize=(10, 5)) +ax = fig.add_subplot(111) +ax.hist(all_counts, bins=100) +n = len(all_counts) +nmore = (all_counts > 50).sum() +ax.set_title('%s function lengths, n=%d' % ('pandas', n)) +ax.set_ylabel('N functions') +ax.set_xlabel('Function length') +ax.text(100, 300, '%.3f%% with > 50 lines' % ((n - nmore) / float(n)), + fontsize=18) +plt.show() diff --git a/scripts/git-mrb b/scripts/git-mrb new file mode 100644 index 00000000..5b48cd9c --- /dev/null +++ b/scripts/git-mrb @@ -0,0 +1,82 @@ +#!/usr/bin/env python +"""git-mrb: merge remote branch. + +git mrb [remote:branch OR remote-branch] [onto] [upstream] + +remote must be locally available, and branch must exist in that remote. + +If 'onto' branch isn't given, default is 'master'. + +If 'upstream' repository isn't given, default is 'origin'. + +You can separate the remote and branch spec with either a : or a -. + +Taken from IPython project +""" +#----------------------------------------------------------------------------- +# Imports +#----------------------------------------------------------------------------- + +from subprocess import check_call +import sys + +#----------------------------------------------------------------------------- +# Functions +#----------------------------------------------------------------------------- + +def sh(cmd): + cmd = cmd.format(**shvars) + print '$', cmd + check_call(cmd, shell=True) + +#----------------------------------------------------------------------------- +# Main Script +#----------------------------------------------------------------------------- + +argv = sys.argv[1:] +narg = len(argv) + +try: + branch_spec = argv[0] + sep = ':' if ':' in branch_spec else '-' + remote, branch = branch_spec.split(':', 1) + if not branch: + raise ValueError('Branch spec %s invalid, branch not found' % + branch_spec) +except: + import traceback as tb + tb.print_exc() + print __doc__ + sys.exit(1) + +onto = argv[1] if narg >= 2 else 'master' +upstream = argv[1] if narg == 3 else 'origin' + +# Git doesn't like ':' in branch names. +if sep == ':': + branch_spec = branch_spec.replace(':', '-') + +# Global used by sh +shvars = dict(remote=remote, branch_spec=branch_spec, branch=branch, + onto=onto, upstream=upstream) + +# Start git calls. +sh('git fetch {remote}') +sh('git checkout -b {branch_spec} {onto}') +sh('git merge {remote}/{branch}') + +print """ +************************************************************* + Run test suite. If tests pass, run the following to merge: + +git checkout {onto} +git merge {branch_spec} +git push {upstream} {onto} + +************************************************************* +""".format(**shvars) + +ans = raw_input("Revert to master and delete temporary branch? 
[Y/n]: ") +if ans.strip().lower() in ('', 'y', 'yes'): + sh('git checkout {onto}') + sh('git branch -D {branch_spec}') \ No newline at end of file diff --git a/scripts/git_code_churn.py b/scripts/git_code_churn.py new file mode 100644 index 00000000..3e999aec --- /dev/null +++ b/scripts/git_code_churn.py @@ -0,0 +1,35 @@ +from dateutil import parser +import subprocess +import os +import re +import sys + +import numpy as np + +from pandas import * + + +if __name__ == '__main__': + from vbench.git import GitRepo + repo = GitRepo('/Users/wesm/code/pandas') + churn = repo.get_churn_by_file() + + file_include = [] + for path in churn.major_axis: + if path.endswith('.pyx') or path.endswith('.py'): + file_include.append(path) + commits_include = [sha for sha in churn.minor_axis + if 'LF' not in repo.messages[sha]] + commits_include.remove('dcf3490') + + clean_churn = churn.reindex(major=file_include, minor=commits_include) + + by_commit = clean_churn.sum('major').sum(1) + + by_date = by_commit.groupby(repo.commit_date).sum() + + by_date = by_date.drop([datetime(2011, 6, 10)]) + + # clean out days where I touched Cython + + by_date = by_date[by_date < 5000] diff --git a/scripts/groupby_sample.py b/scripts/groupby_sample.py new file mode 100644 index 00000000..63638ede --- /dev/null +++ b/scripts/groupby_sample.py @@ -0,0 +1,49 @@ +from pandas import * +import numpy as np +import string + +g1 = np.array(list(string.letters))[:-1] +g2 = np.arange(510) +df_small = DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c", + "c","a","a","a","b","b","b","b"], + 'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1], + 'value' : ["apple","pear","orange","apple", + "banana","durian","lemon","lime", + "raspberry","durian","peach","nectarine", + "banana","lemon","guava","blackberry", + "grape"]}) +value = df_small['value'].values.repeat(3) +df = DataFrame({'group1' : g1.repeat(4000 * 5), + 'group2' : np.tile(g2, 400 * 5), + 'value' : value.repeat(4000 * 5)}) + + +def random_sample(): + grouped = df.groupby(['group1','group2'])['value'] + from random import choice + choose = lambda group: choice(group.index) + indices = grouped.apply(choose) + return df.reindex(indices) + +def random_sample_v2(): + grouped = df.groupby(['group1','group2'])['value'] + from random import choice + choose = lambda group: choice(group.index) + indices = [choice(v) for k, v in grouped.groups.iteritems()] + return df.reindex(indices) + +def do_shuffle(arr): + from random import shuffle + result = arr.copy().values + shuffle(result) + return result + +def shuffle_uri(df,grouped): + perm = np.r_[tuple([np.random.permutation(idxs) for idxs in grouped.groups.itervalues()])] + df['state_permuted'] = np.asarray(df.ix[perm]['value']) + +df2 = df.copy() +grouped = df2.groupby('group1') +shuffle_uri(df2, grouped) + +df2['state_perm'] = grouped['value'].transform(do_shuffle) diff --git a/scripts/groupby_speed.py b/scripts/groupby_speed.py new file mode 100644 index 00000000..c0fa4495 --- /dev/null +++ b/scripts/groupby_speed.py @@ -0,0 +1,31 @@ +from pandas import * + +rng = DateRange('1/3/2011', '11/30/2011', offset=datetools.Minute()) + +df = DataFrame(np.random.randn(len(rng), 5), index=rng, + columns=list('OHLCV')) + +rng5 = DateRange('1/3/2011', '11/30/2011', offset=datetools.Minute(5)) +gp = rng5.asof +grouped = df.groupby(gp) + +def get1(dt): + k = gp(dt) + return grouped.get_group(k) + +def get2(dt): + k = gp(dt) + return df.ix[grouped.groups[k]] + +def f(): + for i, date in enumerate(df.index): + if i % 10000 == 0: + print i + get1(date) 
+ +def g(): + for i, date in enumerate(df.index): + if i % 10000 == 0: + print i + get2(date) + diff --git a/scripts/groupby_test.py b/scripts/groupby_test.py new file mode 100644 index 00000000..6e4177e2 --- /dev/null +++ b/scripts/groupby_test.py @@ -0,0 +1,142 @@ +from collections import defaultdict + +from numpy import nan +import numpy as np + +from pandas import * + +import pandas.lib as tseries +import pandas.core.groupby as gp +import pandas.util.testing as tm +reload(gp) + +""" + +k = 1000 +values = np.random.randn(8 * k) +key1 = np.array(['foo', 'bar', 'baz', 'bar', 'foo', 'baz', 'bar', 'baz'] * k, + dtype=object) +key2 = np.array(['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a' ] * k, + dtype=object) +shape, labels, idicts = gp.labelize(key1, key2) + +print tseries.group_labels(key1) + +# print shape +# print labels +# print idicts + +result = tseries.group_aggregate(values, labels, shape) + +print tseries.groupby_indices(key2) + +df = DataFrame({'key1' : key1, + 'key2' : key2, + 'v1' : values, + 'v2' : values}) +k1 = df['key1'] +k2 = df['key2'] + +# del df['key1'] +# del df['key2'] + +# r2 = gp.multi_groupby(df, np.sum, k1, k2) + +# print result + +gen = gp.generate_groups(df['v1'], labels, shape, axis=1, + factory=DataFrame) + +res = defaultdict(dict) +for a, gen1 in gen: + for b, group in gen1: + print a, b + print group + # res[b][a] = group['values'].sum() + res[b][a] = group.sum() + +res = DataFrame(res) + +grouped = df.groupby(['key1', 'key2']) +""" + +# data = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], +# 'B' : ['A', 'B'] * 6, +# 'C' : np.random.randn(12)} +# df = DataFrame(data) +# df['C'][2:10:2] = nan + +# single column +# grouped = df.drop(['B'], axis=1).groupby('A') +# exp = {} +# for cat, group in grouped: +# exp[cat] = group['C'].sum() +# exp = DataFrame({'C' : exp}) +# result = grouped.sum() + +# grouped = df.groupby(['A', 'B']) +# expd = {} +# for cat1, cat2, group in grouped: +# expd.setdefault(cat1, {})[cat2] = group['C'].sum() +# exp = DataFrame(expd).T.stack() +# result = grouped.sum()['C'] + +# print 'wanted' +# print exp +# print 'got' +# print result + +# tm.N = 10000 + +# mapping = {'A': 0, 'C': 1, 'B': 0, 'D': 1} +# tf = lambda x: x - x.mean() + +# df = tm.makeTimeDataFrame() +# ts = df['A'] + +# # grouped = df.groupby(lambda x: x.strftime('%m/%y')) +# grouped = df.groupby(mapping, axis=1) +# groupedT = df.T.groupby(mapping, axis=0) + +# r1 = groupedT.transform(tf).T +# r2 = grouped.transform(tf) + +# fillit = lambda x: x.fillna(method='pad') + +# f = lambda x: x + +# transformed = df.groupby(lambda x: x.strftime('%m/%y')).transform(lambda x: x) + +# def ohlc(group): +# return Series([group[0], group.max(), group.min(), group[-1]], +# index=['open', 'high', 'low', 'close']) +# grouper = [lambda x: x.year, lambda x: x.month] +# dr = DateRange('1/1/2000', '1/1/2002') +# ts = Series(np.random.randn(len(dr)), index=dr) + +# import string + +# k = 20 +# n = 1000 + +# keys = list(string.letters[:k]) + +# df = DataFrame({'A' : np.tile(keys, n), +# 'B' : np.repeat(keys[:k/2], n * 2), +# 'C' : np.random.randn(k * n)}) + +# def f(): +# for x in df.groupby(['A', 'B']): +# pass + +a = np.arange(100).repeat(100) +b = np.tile(np.arange(100), 100) +index = MultiIndex.from_arrays([a, b]) +s = Series(np.random.randn(len(index)), index) +df = DataFrame({'A' : s}) +df['B'] = df.index.get_level_values(0) +df['C'] = df.index.get_level_values(1) + +def f(): + for x in df.groupby(['B', 'B']): + pass diff --git a/scripts/hdfstore_panel_perf.py b/scripts/hdfstore_panel_perf.py 
new file mode 100644 index 00000000..d344fc80 --- /dev/null +++ b/scripts/hdfstore_panel_perf.py @@ -0,0 +1,16 @@ +from pandas import * +from pandas.util.testing import rands + +i, j, k = 7, 771, 5532 + +panel = Panel(np.random.randn(i, j, k), + items=[rands(10) for _ in xrange(i)], + major_axis=DateRange('1/1/2000', periods=j, + offset=datetools.Minute()), + minor_axis=[rands(10) for _ in xrange(k)]) + + +store = HDFStore('test.h5') +store.put('test_panel', panel, table=True) + +retrieved = store['test_panel'] diff --git a/scripts/leak.py b/scripts/leak.py new file mode 100644 index 00000000..3d704af4 --- /dev/null +++ b/scripts/leak.py @@ -0,0 +1,12 @@ +from pandas import * +import numpy as np +import pandas.util.testing as tm +import os +import psutil + +pid = os.getpid() +proc = psutil.Process(pid) + +df = DataFrame(index=np.arange(100)) +for i in range(5000): + df[i] = 5 diff --git a/scripts/parser_magic.py b/scripts/parser_magic.py new file mode 100644 index 00000000..4eec900b --- /dev/null +++ b/scripts/parser_magic.py @@ -0,0 +1,67 @@ +from pandas.util.testing import set_trace +import pandas.util.testing as tm + +from pandas import * +import ast +import inspect +import sys + +def merge(a, b): + f, args, _ = parse_stmt(inspect.currentframe().f_back) + return DataFrame({args[0] : a, + args[1] : b}) + +def parse_stmt(frame): + info = inspect.getframeinfo(frame) + call = info[-2][0] + mod = ast.parse(call) + body = mod.body[0] + if isinstance(body, (ast.Assign, ast.Expr)): + call = body.value + elif isinstance(body, ast.Call): + call = body + return _parse_call(call) + +def _parse_call(call): + func = _maybe_format_attribute(call.func) + + str_args = [] + for arg in call.args: + if isinstance(arg, ast.Name): + str_args.append(arg.id) + elif isinstance(arg, ast.Call): + formatted = _format_call(arg) + str_args.append(formatted) + + return func, str_args, {} + +def _format_call(call): + func, args, kwds = _parse_call(call) + content = '' + if args: + content += ', '.join(args) + if kwds: + fmt_kwds = ['%s=%s' % item for item in kwds.iteritems()] + joined_kwds = ', '.join(fmt_kwds) + if args: + content = content + ', ' + joined_kwds + else: + content += joined_kwds + return '%s(%s)' % (func, content) + +def _maybe_format_attribute(name): + if isinstance(name, ast.Attribute): + return _format_attribute(name) + return name.id + +def _format_attribute(attr): + obj = attr.value + if isinstance(attr.value, ast.Attribute): + obj = _format_attribute(attr.value) + else: + obj = obj.id + return '.'.join((obj, attr.attr)) + +a = tm.makeTimeSeries() +b = tm.makeTimeSeries() +df = merge(a, b) diff --git a/scripts/preepoch_test.py b/scripts/preepoch_test.py new file mode 100644 index 00000000..b65f09c8 --- /dev/null +++ b/scripts/preepoch_test.py @@ -0,0 +1,22 @@ +import numpy as np +from pandas import * + +def panda_test(): + + # generate some data + data = np.random.rand(50,5) + # generate some dates + dates = DateRange('1/1/1969',periods=50) + # generate column headings + cols = ['A','B','C','D','E'] + + df = DataFrame(data,index=dates,columns=cols) + + # save to HDF5Store + store = HDFStore('bugzilla.h5', mode='w') + store['df'] = df # This gives: OverflowError: mktime argument out of range + store.close() + + +if __name__ == '__main__': + panda_test() diff --git a/scripts/roll_median_leak.py b/scripts/roll_median_leak.py new file mode 100644 index 00000000..b7e41239 --- /dev/null +++ b/scripts/roll_median_leak.py @@ -0,0 +1,24 @@ +from pandas import * + +import numpy as np +import os + +from 
vbench.api import Benchmark +from pandas.util.testing import rands +import pandas.lib as lib +import pandas._sandbox as sbx +import time + +import psutil + +pid = os.getpid() +proc = psutil.Process(pid) + +lst = SparseList() +lst.append([5] * 10000) +lst.append(np.repeat(np.nan, 1000000)) + +for _ in xrange(10000): + print proc.get_memory_info() + sdf = SparseDataFrame({'A' : lst.to_array()}) + chunk = sdf[sdf['A'] == 5] diff --git a/scripts/runtests.py b/scripts/runtests.py new file mode 100644 index 00000000..7816ac25 --- /dev/null +++ b/scripts/runtests.py @@ -0,0 +1,3 @@ +import os; print os.getpid() +import nose +nose.main('pandas.core') diff --git a/scripts/test_py25.bat b/scripts/test_py25.bat new file mode 100644 index 00000000..fbf00b04 --- /dev/null +++ b/scripts/test_py25.bat @@ -0,0 +1,8 @@ +SET PATH=C:\MinGW\bin;C:\Python25;C:\Python25\Scripts;%PATH% +del pandas\_tseries.pyd +del pandas\_sparse.pyd +del pandas\src\tseries.c +del pandas\src\sparse.c +python setup.py clean +python setup.py build_ext -c mingw32 --inplace +nosetests pandas \ No newline at end of file diff --git a/scripts/test_py26.bat b/scripts/test_py26.bat new file mode 100644 index 00000000..e2502e87 --- /dev/null +++ b/scripts/test_py26.bat @@ -0,0 +1,8 @@ +SET PATH=C:\MinGW\bin;E:\Python26;E:\Python26\Scripts;%PATH% +del pandas\_tseries.pyd +del pandas\_sparse.pyd +del pandas\src\tseries.c +del pandas\src\sparse.c +python setup.py clean +python setup.py build_ext -c mingw32 --inplace +nosetests pandas \ No newline at end of file diff --git a/scripts/test_py27.bat b/scripts/test_py27.bat new file mode 100644 index 00000000..11e30562 --- /dev/null +++ b/scripts/test_py27.bat @@ -0,0 +1,6 @@ +SET PATH=C:\MinGW\bin;C:\Python27;C:\Python27\Scripts;%PATH% + +python setup.py clean +python setup.py build_ext -c mingw32 --inplace + +nosetests pandas \ No newline at end of file diff --git a/scripts/test_py31.bat b/scripts/test_py31.bat new file mode 100644 index 00000000..e146ef28 --- /dev/null +++ b/scripts/test_py31.bat @@ -0,0 +1,8 @@ +set BASE=E:\python31 +set PYTHON=%BASE%\python.exe +set NOSETESTS=%BASE%\scripts\nosetests-script.py + +%PYTHON% setup.py install +cd bench +%PYTHON% %NOSETESTS% pandas +cd .. \ No newline at end of file diff --git a/scripts/test_py32.bat b/scripts/test_py32.bat new file mode 100644 index 00000000..31685ae4 --- /dev/null +++ b/scripts/test_py32.bat @@ -0,0 +1,8 @@ +set BASE=E:\python32 +set PYTHON=%BASE%\python.exe +set NOSETESTS=%BASE%\scripts\nosetests-script.py + +%PYTHON% setup.py install +cd bench +%PYTHON% %NOSETESTS% pandas +cd .. \ No newline at end of file diff --git a/scripts/testmed.py b/scripts/testmed.py new file mode 100644 index 00000000..1184fee8 --- /dev/null +++ b/scripts/testmed.py @@ -0,0 +1,161 @@ +## {{{ Recipe 576930 (r10): Efficient Running Median using an Indexable Skiplist + +from random import random +from math import log, ceil + +class Node(object): + __slots__ = 'value', 'next', 'width' + def __init__(self, value, next, width): + self.value, self.next, self.width = value, next, width + +class End(object): + 'Sentinel object that always compares greater than another object' + def __cmp__(self, other): + return 1 + +NIL = Node(End(), [], []) # Singleton terminator node + +class IndexableSkiplist: + 'Sorted collection supporting O(lg n) insertion, removal, and lookup by rank.' 
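+    # Each node stores, per level, a forward link plus a "width" recording
+    # how many items that link skips; __getitem__ walks the levels top-down
+    # and subtracts widths, which is what makes lookup by rank O(log n).
+    # insert() and remove() below keep these widths consistent while
+    # relinking nodes.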
+ + def __init__(self, expected_size=100): + self.size = 0 + self.maxlevels = int(1 + log(expected_size, 2)) + self.head = Node('HEAD', [NIL]*self.maxlevels, [1]*self.maxlevels) + + def __len__(self): + return self.size + + def __getitem__(self, i): + node = self.head + i += 1 + for level in reversed(range(self.maxlevels)): + while node.width[level] <= i: + i -= node.width[level] + node = node.next[level] + return node.value + + def insert(self, value): + # find first node on each level where node.next[levels].value > value + chain = [None] * self.maxlevels + steps_at_level = [0] * self.maxlevels + node = self.head + for level in reversed(range(self.maxlevels)): + while node.next[level].value <= value: + steps_at_level[level] += node.width[level] + node = node.next[level] + chain[level] = node + + # insert a link to the newnode at each level + d = min(self.maxlevels, 1 - int(log(random(), 2.0))) + newnode = Node(value, [None]*d, [None]*d) + steps = 0 + for level in range(d): + prevnode = chain[level] + newnode.next[level] = prevnode.next[level] + prevnode.next[level] = newnode + newnode.width[level] = prevnode.width[level] - steps + prevnode.width[level] = steps + 1 + steps += steps_at_level[level] + for level in range(d, self.maxlevels): + chain[level].width[level] += 1 + self.size += 1 + + def remove(self, value): + # find first node on each level where node.next[levels].value >= value + chain = [None] * self.maxlevels + node = self.head + for level in reversed(range(self.maxlevels)): + while node.next[level].value < value: + node = node.next[level] + chain[level] = node + if value != chain[0].next[0].value: + raise KeyError('Not Found') + + # remove one link at each level + d = len(chain[0].next[0].next) + for level in range(d): + prevnode = chain[level] + prevnode.width[level] += prevnode.next[level].width[level] - 1 + prevnode.next[level] = prevnode.next[level].next[level] + for level in range(d, self.maxlevels): + chain[level].width[level] -= 1 + self.size -= 1 + + def __iter__(self): + 'Iterate over values in sorted order' + node = self.head.next[0] + while node is not NIL: + yield node.value + node = node.next[0] + +from collections import deque +from itertools import islice + +class RunningMedian: + 'Fast running median with O(lg n) updates where n is the window size' + + def __init__(self, n, iterable): + from pandas.lib.skiplist import IndexableSkiplist as skiplist + + self.it = iter(iterable) + self.queue = deque(islice(self.it, n)) + self.skiplist = IndexableSkiplist(n) + for elem in self.queue: + self.skiplist.insert(elem) + + def __iter__(self): + queue = self.queue + skiplist = self.skiplist + midpoint = len(queue) // 2 + yield skiplist[midpoint] + for newelem in self.it: + oldelem = queue.popleft() + skiplist.remove(oldelem) + queue.append(newelem) + skiplist.insert(newelem) + yield skiplist[midpoint] + +N = 100000 +K = 10000 + +import time + +def test(): + from numpy.random import randn + + arr = randn(N) + + def _test(arr, k): + meds = RunningMedian(k, arr) + return list(meds) + + _test(arr, K) + +from numpy.random import randn +from pandas.lib.skiplist import rolling_median + +def test2(): + + arr = randn(N) + + return rolling_median(arr, K) + +def runmany(f, arr, arglist): + timings = [] + + for arg in arglist: + tot = 0 + for i in range(5): + tot += _time(f, arr, arg) + timings.append(tot / 5) + + return timings + +def _time(f, *args): + _start = time.clock() + result = f(*args) + return time.clock() - _start + +if __name__ == '__main__': + test2() diff --git 
a/scripts/winbuild_py25.bat b/scripts/winbuild_py25.bat new file mode 100644 index 00000000..5ecebab7 --- /dev/null +++ b/scripts/winbuild_py25.bat @@ -0,0 +1,2 @@ +SET PATH=C:\MinGW\bin;C:\Python25;C:\Python25\Scripts;%PATH% +python setup.py build -c mingw32 bdist_wininst diff --git a/scripts/winbuild_py27.bat b/scripts/winbuild_py27.bat new file mode 100644 index 00000000..bec67c7e --- /dev/null +++ b/scripts/winbuild_py27.bat @@ -0,0 +1,2 @@ +SET PATH=C:\MinGW\bin;C:\Python27;C:\Python27\Scripts;%PATH% +python setup.py build -c mingw32 bdist_wininst diff --git a/setup.py b/setup.py new file mode 100755 index 00000000..95d24148 --- /dev/null +++ b/setup.py @@ -0,0 +1,451 @@ +#!/usr/bin/env python + +""" +Parts of this file were taken from the pyzmq project +(https://github.com/zeromq/pyzmq) which have been permitted for use under the +BSD license. Parts are from lxml (https://github.com/lxml/lxml) +""" + +import os +import sys +import shutil +import warnings + +# may need to work around setuptools bug by providing a fake Pyrex +try: + import Cython + sys.path.insert(0, os.path.join(os.path.dirname(__file__), "fake_pyrex")) +except ImportError: + pass + +# try bootstrapping setuptools if it doesn't exist +try: + import pkg_resources + try: + pkg_resources.require("setuptools>=0.6c5") + except pkg_resources.VersionConflict: + from ez_setup import use_setuptools + use_setuptools(version="0.6c5") + from setuptools import setup, Command + _have_setuptools = True +except ImportError: + # no setuptools installed + from distutils.core import setup, Command + _have_setuptools = False + +setuptools_kwargs = {} +if sys.version_info[0] >= 3: + + setuptools_kwargs = {'use_2to3': True, + 'zip_safe': False, + 'install_requires': ['python-dateutil >= 2', + 'pytz', + 'numpy >= 1.4'], + 'use_2to3_exclude_fixers': ['lib2to3.fixes.fix_next', + ], + } + if not _have_setuptools: + sys.exit("need setuptools/distribute for Py3k" + "\n$ pip install distribute") + +else: + setuptools_kwargs = { + 'install_requires': ['python-dateutil < 2', + 'pytz', + 'numpy >= 1.6'], + 'zip_safe' : False, + } + if not _have_setuptools: + try: + import numpy + import dateutil + setuptools_kwargs = {} + except ImportError: + sys.exit("install requires: 'python-dateutil < 2','numpy'." + " use pip or easy_install." + "\n $ pip install 'python-dateutil < 2' 'numpy'") + +try: + import numpy as np +except ImportError: + nonumpy_msg = ("# numpy needed to finish setup. run:\n\n" + " $ pip install numpy # or easy_install numpy\n") + sys.exit(nonumpy_msg) + +if np.__version__ < '1.6.1': + msg = "pandas requires NumPy >= 1.6 due to datetime64 dependency" + sys.exit(msg) + +from distutils.extension import Extension +from distutils.command.build import build +from distutils.command.build_ext import build_ext +from distutils.command.sdist import sdist + +from os.path import splitext, basename, join as pjoin + +DESCRIPTION = ("Powerful data structures for data analysis, time series," + "and statistics") +LONG_DESCRIPTION = """ +**pandas** is a Python package providing fast, flexible, and expressive data +structures designed to make working with structured (tabular, multidimensional, +potentially heterogeneous) and time series data both easy and intuitive. It +aims to be the fundamental high-level building block for doing practical, +**real world** data analysis in Python. Additionally, it has the broader goal +of becoming **the most powerful and flexible open source data analysis / +manipulation tool available in any language**. 
It is already well on its way +toward this goal. + +pandas is well suited for many different kinds of data: + + - Tabular data with heterogeneously-typed columns, as in an SQL table or + Excel spreadsheet + - Ordered and unordered (not necessarily fixed-frequency) time series data. + - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and + column labels + - Any other form of observational / statistical data sets. The data actually + need not be labeled at all to be placed into a pandas data structure + +The two primary data structures of pandas, Series (1-dimensional) and DataFrame +(2-dimensional), handle the vast majority of typical use cases in finance, +statistics, social science, and many areas of engineering. For R users, +DataFrame provides everything that R's ``data.frame`` provides and much +more. pandas is built on top of `NumPy `__ and is +intended to integrate well within a scientific computing environment with many +other 3rd party libraries. + +Here are just a few of the things that pandas does well: + + - Easy handling of **missing data** (represented as NaN) in floating point as + well as non-floating point data + - Size mutability: columns can be **inserted and deleted** from DataFrame and + higher dimensional objects + - Automatic and explicit **data alignment**: objects can be explicitly + aligned to a set of labels, or the user can simply ignore the labels and + let `Series`, `DataFrame`, etc. automatically align the data for you in + computations + - Powerful, flexible **group by** functionality to perform + split-apply-combine operations on data sets, for both aggregating and + transforming data + - Make it **easy to convert** ragged, differently-indexed data in other + Python and NumPy data structures into DataFrame objects + - Intelligent label-based **slicing**, **fancy indexing**, and **subsetting** + of large data sets + - Intuitive **merging** and **joining** data sets + - Flexible **reshaping** and pivoting of data sets + - **Hierarchical** labeling of axes (possible to have multiple labels per + tick) + - Robust IO tools for loading data from **flat files** (CSV and delimited), + Excel files, databases, and saving / loading data from the ultrafast **HDF5 + format** + - **Time series**-specific functionality: date range generation and frequency + conversion, moving window statistics, moving window linear regressions, + date shifting and lagging, etc. + +Many of these principles are here to address the shortcomings frequently +experienced using other languages / scientific research environments. For data +scientists, working with data is typically divided into multiple stages: +munging and cleaning data, analyzing / modeling it, then organizing the results +of the analysis into a form suitable for plotting or tabular display. pandas is +the ideal tool for all of these tasks. 
+ +Note +---- +Windows binaries built against NumPy 1.6.1 +""" + +DISTNAME = 'pandas' +LICENSE = 'BSD' +AUTHOR = "The PyData Development Team" +EMAIL = "pydata@googlegroups.com" +URL = "http://pandas.pydata.org" +DOWNLOAD_URL = '' +CLASSIFIERS = [ + 'Development Status :: 4 - Beta', + 'Environment :: Console', + 'Operating System :: OS Independent', + 'Intended Audience :: Science/Research', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Programming Language :: Cython', + 'Topic :: Scientific/Engineering', +] + +MAJOR = 0 +MINOR = 8 +MICRO = 0 +ISRELEASED = True +VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) +QUALIFIER = '' + +FULLVERSION = VERSION +if not ISRELEASED: + FULLVERSION += '.dev' + try: + import subprocess + try: + pipe = subprocess.Popen(["git", "rev-parse", "--short", "HEAD"], + stdout=subprocess.PIPE).stdout + except OSError: + # msysgit compatibility + pipe = subprocess.Popen(["git.cmd", "rev-parse", "--short", "HEAD"], + stdout=subprocess.PIPE).stdout + rev = pipe.read().strip() + # makes distutils blow up on Python 2.7 + if sys.version_info[0] >= 3: + rev = rev.decode('ascii') + + FULLVERSION += "-%s" % rev + except: + warnings.warn("WARNING: Couldn't get git revision") +else: + FULLVERSION += QUALIFIER + +def write_version_py(filename=None): + cnt = """\ +version = '%s' +short_version = '%s' +""" + if not filename: + filename = os.path.join(os.path.dirname(__file__), 'pandas', 'version.py') + + a = open(filename, 'w') + try: + a.write(cnt % (FULLVERSION, VERSION)) + finally: + a.close() + +class CleanCommand(Command): + """Custom distutils command to clean the .so and .pyc files.""" + + user_options = [("all", "a", "") ] + + def initialize_options(self): + self.all = True + self._clean_me = [] + self._clean_trees = [] + self._clean_exclude = ['np_datetime.c', + 'np_datetime_strings.c', + 'period.c'] + + for root, dirs, files in list(os.walk('pandas')): + for f in files: + if f in self._clean_exclude: + continue + if os.path.splitext(f)[-1] in ('.pyc', '.so', '.o', + '.pyd', '.c'): + self._clean_me.append(pjoin(root, f)) + for d in dirs: + if d == '__pycache__': + self._clean_trees.append(pjoin(root, d)) + + for d in ('build',): + if os.path.exists(d): + self._clean_trees.append(d) + + def finalize_options(self): + pass + + def run(self): + for clean_me in self._clean_me: + try: + os.unlink(clean_me) + except Exception: + pass + for clean_tree in self._clean_trees: + try: + shutil.rmtree(clean_tree) + except Exception: + pass + +class CheckSDist(sdist): + """Custom sdist that ensures Cython has compiled all pyx files to c.""" + + _pyxfiles = ['pandas/src/tseries.pyx' + 'pandas/src/sparse.pyx'] + + def initialize_options(self): + sdist.initialize_options(self) + + ''' + self._pyxfiles = [] + for root, dirs, files in os.walk('pandas'): + for f in files: + if f.endswith('.pyx'): + self._pyxfiles.append(pjoin(root, f)) + ''' + + def run(self): + if 'cython' in cmdclass: + self.run_command('cython') + else: + for pyxfile in self._pyxfiles: + cfile = pyxfile[:-3]+'c' + msg = "C-source file '%s' not found."%(cfile)+\ + " Run 'setup.py cython' before sdist." + assert os.path.isfile(cfile), msg + sdist.run(self) + +class CheckingBuildExt(build_ext): + """Subclass build_ext to get clearer report if Cython is necessary.""" + + def check_cython_extensions(self, extensions): + for ext in extensions: + for src in ext.sources: + if not os.path.exists(src): + raise Exception("""Cython-generated file '%s' not found. 
+ Cython is required to compile pandas from a development branch. + Please install Cython or download a release package of pandas. + """ % src) + + def build_extensions(self): + self.check_cython_extensions(self.extensions) + self.check_extensions_list(self.extensions) + + for ext in self.extensions: + self.build_extension(ext) + +cmdclass = {'clean': CleanCommand, + 'build': build} + +try: + from Cython.Distutils import build_ext + #from Cython.Distutils import Extension # to get pyrex debugging symbols + cython=True +except ImportError: + cython=False + suffix = '.c' + cmdclass['build_ext'] = CheckingBuildExt +else: + suffix = '.pyx' + class CythonCommand(build_ext): + """Custom distutils command subclassed from Cython.Distutils.build_ext + to compile pyx->c, and stop there. All this does is override the + C-compile method build_extension() with a no-op.""" + def build_extension(self, ext): + pass + + class DummyBuildSrc(Command): + """ numpy's build_src command interferes with Cython's build_ext. + """ + user_options = [] + def initialize_options(self): + self.py_modules_dict = {} + def finalize_options(self): + pass + def run(self): + pass + + cmdclass['build_src'] = DummyBuildSrc + cmdclass['cython'] = CythonCommand + cmdclass['build_ext'] = build_ext + cmdclass['sdist'] = CheckSDist + +tseries_depends = ['reindex', 'groupby', 'skiplist', 'moments', + 'reduce', 'stats', 'datetime', + 'hashtable', 'inference', 'properties', 'join', 'engines'] + +plib_depends = ['plib'] + +def srcpath(name=None, suffix='.pyx', subdir='src'): + return pjoin('pandas', subdir, name+suffix) + +if suffix == '.pyx': + tseries_depends = [srcpath(f, suffix='.pyx') + for f in tseries_depends] + tseries_depends.append('pandas/src/util.pxd') + plib_depends = [srcpath(f, suffix='.pyx') + for f in plib_depends] + plib_depends.append('pandas/src/util.pxd') +else: + tseries_depends = [] + plib_depends = [] + +algos_ext = Extension('pandas._algos', + sources=[srcpath('generated', suffix=suffix)], + include_dirs=[np.get_include()], + ) + +lib_ext = Extension('pandas.lib', + depends=tseries_depends + ['pandas/src/numpy_helper.h'], + sources=[srcpath('tseries', suffix=suffix), + 'pandas/src/datetime/np_datetime.c', + 'pandas/src/datetime/np_datetime_strings.c'], + include_dirs=[np.get_include()], + # pyrex_gdb=True, + # extra_compile_args=['-Wconversion'] + ) + +period_ext = Extension('pandas._period', + depends=plib_depends + ['pandas/src/numpy_helper.h', + 'pandas/src/period.h'], + sources=[srcpath('plib', suffix=suffix), + 'pandas/src/datetime/np_datetime.c', + 'pandas/src/period.c'], + include_dirs=[np.get_include()]) + + +sparse_ext = Extension('pandas._sparse', + sources=[srcpath('sparse', suffix=suffix)], + include_dirs=[np.get_include()]) + +sandbox_ext = Extension('pandas._sandbox', + sources=[srcpath('sandbox', suffix=suffix)], + include_dirs=[np.get_include()]) + +cppsandbox_ext = Extension('pandas._cppsandbox', + language='c++', + sources=[srcpath('cppsandbox', suffix=suffix)], + include_dirs=[np.get_include()]) + +extensions = [algos_ext, lib_ext, period_ext, sparse_ext] + +if not ISRELEASED: + extensions.extend([sandbox_ext]) + +# if _have_setuptools: +# setuptools_kwargs["test_suite"] = "nose.collector" + +write_version_py() +setup(name=DISTNAME, + version=FULLVERSION, + maintainer=AUTHOR, + packages=['pandas', + 'pandas.compat', + 'pandas.core', + 'pandas.io', + 'pandas.rpy', + 'pandas.sandbox', + 'pandas.sparse', + 'pandas.sparse.tests', + 'pandas.stats', + 'pandas.util', + 'pandas.tests', + 
'pandas.tools', + 'pandas.tools.tests', + 'pandas.tseries', + 'pandas.tseries.tests', + 'pandas.io.tests', + 'pandas.stats.tests', + ], + package_data={'pandas.io' : ['tests/*.h5', + 'tests/*.csv', + 'tests/*.xls', + 'tests/*.xlsx', + 'tests/*.table'], + 'pandas.tests' : ['data/*.pickle', + 'data/*.csv'], + 'pandas.tseries.tests' : ['data/*.pickle', + 'data/*.csv'] + }, + ext_modules=extensions, + maintainer_email=EMAIL, + description=DESCRIPTION, + license=LICENSE, + cmdclass = cmdclass, + url=URL, + download_url=DOWNLOAD_URL, + long_description=LONG_DESCRIPTION, + classifiers=CLASSIFIERS, + platforms='any', + **setuptools_kwargs) diff --git a/test.sh b/test.sh new file mode 100755 index 00000000..324ac68d --- /dev/null +++ b/test.sh @@ -0,0 +1,10 @@ +#!/bin/sh +coverage erase +# nosetests pandas/tests/test_index.py --with-coverage --cover-package=pandas.core --pdb-failure --pdb +#nosetests -w pandas --with-coverage --cover-package=pandas --pdb-failure --pdb #--cover-inclusive +#nosetests -A "not slow" -w pandas/tseries --with-coverage --cover-package=pandas.tseries $* #--cover-inclusive +nosetests -w pandas --with-coverage --cover-package=pandas $* +# nosetests -w pandas/io --with-coverage --cover-package=pandas.io --pdb-failure --pdb +# nosetests -w pandas/core --with-coverage --cover-package=pandas.core --pdb-failure --pdb +# nosetests -w pandas/stats --with-coverage --cover-package=pandas.stats +# coverage run runtests.py diff --git a/test_fast.sh b/test_fast.sh new file mode 100755 index 00000000..830443dc --- /dev/null +++ b/test_fast.sh @@ -0,0 +1 @@ +nosetests -A "not slow" pandas $* \ No newline at end of file diff --git a/test_rebuild.sh b/test_rebuild.sh new file mode 100755 index 00000000..d3710c5f --- /dev/null +++ b/test_rebuild.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +python setup.py clean +python setup.py build_ext --inplace +coverage erase +# nosetests pandas/tests/test_index.py --with-coverage --cover-package=pandas.core --pdb-failure --pdb +#nosetests -w pandas --with-coverage --cover-package=pandas --pdb-failure --pdb #--cover-inclusive +nosetests -w pandas --with-coverage --cover-package=pandas $* #--cover-inclusive +# nosetests -w pandas/io --with-coverage --cover-package=pandas.io --pdb-failure --pdb +# nosetests -w pandas/core --with-coverage --cover-package=pandas.core --pdb-failure --pdb +# nosetests -w pandas/stats --with-coverage --cover-package=pandas.stats +# coverage run runtests.py diff --git a/tox.ini b/tox.ini new file mode 100644 index 00000000..2f5f998b --- /dev/null +++ b/tox.ini @@ -0,0 +1,39 @@ +# Tox (http://tox.testrun.org/) is a tool for running tests +# in multiple virtualenvs. This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. 
+ +[tox] +envlist = py25, py26, py27, py31, py32 + +[testenv] +commands = + {envpython} setup.py clean build_ext install + {envbindir}/nosetests tests + rm -rf {toxinidir}/build {toxinidir}/tests +deps = + cython + numpy >= 1.6.1 + nose + pytz + +[testenv:py25] +changedir = .tox/py25/lib/python2.5/site-packages/pandas +deps = + cython + numpy >= 1.6.1 + nose + pytz + simplejson + +[testenv:py26] +changedir = .tox/py26/lib/python2.6/site-packages/pandas + +[testenv:py27] +changedir = .tox/py27/lib/python2.7/site-packages/pandas + +[testenv:py31] +changedir = .tox/py31/lib/python3.1/site-packages/pandas + +[testenv:py32] +changedir = .tox/py32/lib/python3.2/site-packages/pandas diff --git a/ts_todo.txt b/ts_todo.txt new file mode 100644 index 00000000..c2202530 --- /dev/null +++ b/ts_todo.txt @@ -0,0 +1,20 @@ +- DatetimeIndex.union empty -> error +- DateOffset display sucks -> 1 Day + +gp- get rid of Ts class, simplify timestamp creation +- rename tzinfo to tz +- get rid of deprecated / new offset mapping, will deal with users +- add Index.is_unique, check via is_monotonic cython routine +- tofreq -> asfreq again +- work around numpy refcount bug in Index.append with datetime64 type promotion +- fix bug in Index.join with empty indexes +- handle legacy DateRange unpickling in BlockManager +- date_range/bdate_range factory functions, test refactoring +- got rid of all deprecation warnings in test suite, usages of DateRange +- fix merge issue? in generate_code.py +- attach tz in DatetimeIndex.asobject +- failing duplicate timestamp test +- _tseries.pyd depends on datetime.pyx + + +- BUG: time_rule DateRange tests diff --git a/vb_suite/.gitignore b/vb_suite/.gitignore new file mode 100644 index 00000000..cc110f04 --- /dev/null +++ b/vb_suite/.gitignore @@ -0,0 +1,4 @@ +benchmarks.db +build/* +source/vbench/* +source/*.rst \ No newline at end of file diff --git a/vb_suite/attrs_caching.py b/vb_suite/attrs_caching.py new file mode 100644 index 00000000..e196546e --- /dev/null +++ b/vb_suite/attrs_caching.py @@ -0,0 +1,20 @@ +from vbench.benchmark import Benchmark + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# DataFrame.index / columns property lookup time + +setup = common_setup + """ +df = DataFrame(np.random.randn(10, 6)) +cur_index = df.index +""" +stmt = "foo = df.index" + +getattr_dataframe_index = Benchmark(stmt, setup, + name="getattr_dataframe_index") + +stmt = "df.index = cur_index" +setattr_dataframe_index = Benchmark(stmt, setup, + name="setattr_dataframe_index") diff --git a/vb_suite/binary_ops.py b/vb_suite/binary_ops.py new file mode 100644 index 00000000..080ed3ff --- /dev/null +++ b/vb_suite/binary_ops.py @@ -0,0 +1,26 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# data alignment + +setup = common_setup + """n = 1000000 +# indices = Index([rands(10) for _ in xrange(n)]) +def sample(values, k): + sampler = np.random.permutation(len(values)) + return values.take(sampler[:k]) +sz = 500000 +rng = np.arange(0, 10000000000000, 10000000) +stamps = np.datetime64(datetime.now()).view('i8') + rng +idx1 = np.sort(sample(stamps, sz)) +idx2 = np.sort(sample(stamps, sz)) +ts1 = Series(np.random.randn(sz), idx1) +ts2 = Series(np.random.randn(sz), idx2) +""" +stmt = "ts1 + ts2" +series_align_int64_index = Benchmark(stmt, setup, + 
start_date=datetime(2010, 6, 1), + logy=True) diff --git a/vb_suite/ctors.py b/vb_suite/ctors.py new file mode 100644 index 00000000..365af021 --- /dev/null +++ b/vb_suite/ctors.py @@ -0,0 +1,17 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# Series constructors + +setup = common_setup + """ +data = np.random.randn(100) +index = Index(np.arange(100)) +""" + +series_constructor_ndarray = \ + Benchmark("Series(data, index=index)", setup=setup, + name='series_constructor_ndarray') diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py new file mode 100644 index 00000000..f8d6e5d5 --- /dev/null +++ b/vb_suite/frame_ctor.py @@ -0,0 +1,52 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# Creation from nested dict + +setup = common_setup + """ +N, K = 5000, 50 +index = [rands(10) for _ in xrange(N)] +columns = [rands(10) for _ in xrange(K)] +frame = DataFrame(np.random.randn(N, K), index=index, columns=columns) + +try: + data = frame.to_dict() +except: + data = frame.toDict() + +some_dict = data.values()[0] +dict_list = [dict(zip(columns, row)) for row in frame.values] +""" + +frame_ctor_nested_dict = Benchmark("DataFrame(data)", setup) + +# From JSON-like stuff + +frame_ctor_list_of_dict = Benchmark("DataFrame(dict_list)", setup, + start_date=datetime(2011, 12, 20)) + +series_ctor_from_dict = Benchmark("Series(some_dict)", setup) + +# nested dict, integer indexes, regression described in #621 + +setup = common_setup + """ +data = dict((i,dict((j,float(j)) for j in xrange(100))) for i in xrange(2000)) +""" +frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup) + +#---------------------------------------------------------------------- +# get_numeric_data + +setup = common_setup + """ +df = DataFrame(randn(10000, 25)) +df['foo'] = 'bar' +df['bar'] = 'baz' +df = df.consolidate() +""" + +frame_get_numeric_data = Benchmark('df._get_numeric_data()', setup, + start_date=datetime(2011, 11, 1)) diff --git a/vb_suite/frame_methods.py b/vb_suite/frame_methods.py new file mode 100644 index 00000000..08f041d8 --- /dev/null +++ b/vb_suite/frame_methods.py @@ -0,0 +1,67 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# lookup + +setup = common_setup + """ +df = DataFrame(np.random.randn(10000, 8), columns=list('abcdefgh')) +df['foo'] = 'bar' + +row_labels = list(df.index[::10])[:900] +col_labels = list(df.columns) * 100 +row_labels_all = list(df.index) * len(df.columns) +col_labels_all = list(df.columns) * len(df.index) +""" + +frame_fancy_lookup = Benchmark('df.lookup(row_labels, col_labels)', setup, + start_date=datetime(2012, 1, 12)) + +frame_fancy_lookup_all = Benchmark('df.lookup(row_labels_all, col_labels_all)', + setup, + start_date=datetime(2012, 1, 12)) + +#---------------------------------------------------------------------- +# fillna in place + +setup = common_setup + """ +df = DataFrame(randn(10000, 100)) +df.values[::2] = np.nan +""" + +frame_fillna_inplace = Benchmark('df.fillna(0, inplace=True)', setup, + start_date=datetime(2012, 4, 4)) + + +#---------------------------------------------------------------------- 
+# reindex both axes + +setup = common_setup + """ +df = DataFrame(randn(1000, 1000)) +idx = range(400, 700) +""" + +frame_reindex_axis0 = Benchmark('df.reindex(idx)', setup) + +frame_reindex_axis1 = Benchmark('df.reindex(columns=idx)', setup) + +frame_reindex_both_axes = Benchmark('df.reindex(index=idx, columns=idx)', + setup, start_date=datetime(2011, 1, 1)) + +frame_reindex_both_axes_ix = Benchmark('df.ix[idx, idx]', setup, + start_date=datetime(2011, 1, 1)) + +#---------------------------------------------------------------------- +# boolean indexing + +setup = common_setup + """ +df = DataFrame(randn(10000, 100)) +bool_arr = np.zeros(10000, dtype=bool) +bool_arr[:1000] = True +""" + +frame_boolean_row_select = Benchmark('df[bool_arr]', setup, + start_date=datetime(2011, 1, 1)) diff --git a/vb_suite/generate_rst_files.py b/vb_suite/generate_rst_files.py new file mode 100644 index 00000000..92e7cd4d --- /dev/null +++ b/vb_suite/generate_rst_files.py @@ -0,0 +1,2 @@ +from suite import benchmarks, generate_rst_files +generate_rst_files(benchmarks) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py new file mode 100644 index 00000000..f690135e --- /dev/null +++ b/vb_suite/groupby.py @@ -0,0 +1,194 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +setup = common_setup + """ +N = 100000 +ngroups = 100 + +def get_test_data(ngroups=100, n=N): + unique_groups = range(ngroups) + arr = np.asarray(np.tile(unique_groups, n / ngroups), dtype=object) + + if len(arr) < n: + arr = np.asarray(list(arr) + unique_groups[:n - len(arr)], + dtype=object) + + random.shuffle(arr) + return arr + +# aggregate multiple columns +df = DataFrame({'key1' : get_test_data(ngroups=ngroups), + 'key2' : get_test_data(ngroups=ngroups), + 'data1' : np.random.randn(N), + 'data2' : np.random.randn(N)}) +def f(): + df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum()) + +simple_series = Series(np.random.randn(N)) +key1 = df['key1'] +""" + +stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())" +groupby_multi_python = Benchmark(stmt1, setup, + start_date=datetime(2011, 7, 1)) + +stmt3 = "df.groupby(['key1', 'key2']).sum()" +groupby_multi_cython = Benchmark(stmt3, setup, + start_date=datetime(2011, 7, 1)) + +stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)" +groupby_multi_series_op = Benchmark(stmt, setup, + start_date=datetime(2011, 8, 1)) + +groupby_series_simple_cython = \ + Benchmark('simple_series.groupby(key1).sum()', setup, + start_date=datetime(2011, 3, 1)) + +#---------------------------------------------------------------------- +# 2d grouping, aggregate many columns + +setup = common_setup + """ +labels = np.random.randint(0, 100, size=1000) +df = DataFrame(randn(1000, 1000)) +""" + +groupby_frame_cython_many_columns = Benchmark('df.groupby(labels).sum()', setup, + start_date=datetime(2011, 8, 1), + logy=True) + +#---------------------------------------------------------------------- +# single key, long, integer key + +setup = common_setup + """ +data = np.random.randn(100000, 1) +labels = np.random.randint(0, 1000, size=100000) +df = DataFrame(data) +""" + +groupby_frame_singlekey_integer = \ + Benchmark('df.groupby(labels).sum()', setup, + start_date=datetime(2011, 8, 1), logy=True) + +#---------------------------------------------------------------------- +# group with different functions per column + +setup = common_setup + """ +fac1 = np.array(['A', 'B', 'C'], dtype='O') +fac2 = np.array(['one', 
'two'], dtype='O') + +df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=100000)), + 'key2': fac2.take(np.random.randint(0, 2, size=100000)), + 'value1' : np.random.randn(100000), + 'value2' : np.random.randn(100000), + 'value3' : np.random.randn(100000)}) +""" + +groupby_multi_different_functions = \ + Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : 'mean', + 'value2' : 'var', + 'value3' : 'sum'})""", + setup, start_date=datetime(2011, 9, 1)) + +groupby_multi_different_numpy_functions = \ + Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : np.mean, + 'value2' : np.var, + 'value3' : np.sum})""", + setup, start_date=datetime(2011, 9, 1)) + +#---------------------------------------------------------------------- +# size() speed + +setup = common_setup + """ +df = DataFrame({'key1': np.random.randint(0, 500, size=100000), + 'key2': np.random.randint(0, 100, size=100000), + 'value1' : np.random.randn(100000), + 'value2' : np.random.randn(100000), + 'value3' : np.random.randn(100000)}) +""" + +groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()", + setup, start_date=datetime(2011, 10, 1)) + +#---------------------------------------------------------------------- +# Series.value_counts + +setup = common_setup + """ +s = Series(np.random.randint(0, 1000, size=100000)) +""" + +series_value_counts_int64 = Benchmark('s.value_counts()', setup, + start_date=datetime(2011, 10, 21)) + +#---------------------------------------------------------------------- +# pivot_table + +setup = common_setup + """ +fac1 = np.array(['A', 'B', 'C'], dtype='O') +fac2 = np.array(['one', 'two'], dtype='O') + +ind1 = np.random.randint(0, 3, size=100000) +ind2 = np.random.randint(0, 2, size=100000) + +df = DataFrame({'key1': fac1.take(ind1), + 'key2': fac2.take(ind2), + 'key3': fac2.take(ind2), + 'value1' : np.random.randn(100000), + 'value2' : np.random.randn(100000), + 'value3' : np.random.randn(100000)}) +""" + +stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])" +groupby_pivot_table = Benchmark(stmt, setup, start_date=datetime(2011, 12, 15)) + + +#---------------------------------------------------------------------- +# dict return values + +setup = common_setup + """ +labels = np.arange(1000).repeat(10) +data = Series(randn(len(labels))) +f = lambda x: {'first': x.values[0], 'last': x.values[-1]} +""" + +groupby_apply_dict_return = Benchmark('data.groupby(labels).apply(f)', + setup, start_date=datetime(2011, 12, 15)) + +#---------------------------------------------------------------------- +# First / last functions + +setup = common_setup + """ +labels = np.arange(10000).repeat(10) +data = Series(randn(len(labels))) +data[::3] = np.nan +data[1::3] = np.nan +labels = labels.take(np.random.permutation(len(labels))) +""" + +groupby_first = Benchmark('data.groupby(labels).first()', setup, + start_date=datetime(2012, 5, 1)) + +groupby_last = Benchmark('data.groupby(labels).last()', setup, + start_date=datetime(2012, 5, 1)) + + +#---------------------------------------------------------------------- +# groupby_indices replacement, chop up Series + +setup = common_setup + """ +try: + rng = date_range('1/1/2000', '12/31/2005', freq='H') + year, month, day = rng.year, rng.month, rng.day +except: + rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour()) + year = rng.map(lambda x: x.year) + month = rng.map(lambda x: x.month) + day = rng.map(lambda x: x.day) + +ts = Series(np.random.randn(len(rng)), index=rng) +""" + +groupby_indices = 
Benchmark('len(ts.groupby([year, month, day]))', + setup, start_date=datetime(2012, 1, 1)) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py new file mode 100644 index 00000000..819a81a5 --- /dev/null +++ b/vb_suite/index_object.py @@ -0,0 +1,35 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +SECTION = "Index / MultiIndex objects" + + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# intersection, union + +setup = common_setup + """ +rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute()) +rng = rng.view(Index) +rng2 = rng[:-1] +""" + +index_datetime_intersection = Benchmark("rng.intersection(rng2)", setup) +index_datetime_union = Benchmark("rng.union(rng2)", setup) + +# integers +setup = common_setup + """ +N = 1000000 +options = np.arange(N) + +left = Index(options.take(np.random.permutation(N)[:N // 2])) +right = Index(options.take(np.random.permutation(N)[:N // 2])) +""" + +index_int64_union = Benchmark('left.union(right)', setup, + start_date=datetime(2011, 1, 1)) + +index_int64_intersection = Benchmark('left.intersection(right)', setup, + start_date=datetime(2011, 1, 1)) diff --git a/vb_suite/indexing.py b/vb_suite/indexing.py new file mode 100644 index 00000000..3c10a155 --- /dev/null +++ b/vb_suite/indexing.py @@ -0,0 +1,109 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +SECTION = 'Indexing and scalar value access' + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# Series.__getitem__, get_value + +setup = common_setup + """ +tm.N = 1000 +ts = tm.makeTimeSeries() +dt = ts.index[500] +""" +statement = "ts[dt]" + +bm_getitem = Benchmark(statement, setup, ncalls=100000, + name='series_getitem_scalar') + +setup = common_setup + """ +index = [tm.rands(10) for _ in xrange(1000)] +s = Series(np.random.rand(1000), index=index) +idx = index[100] +""" +statement = "s.get_value(idx)" +bm_df_getitem3 = Benchmark(statement, setup, + name='series_get_value', + start_date=datetime(2011, 11, 12)) + +#---------------------------------------------------------------------- +# DataFrame __getitem__ + +setup = common_setup + """ +index = [tm.rands(10) for _ in xrange(1000)] +columns = [tm.rands(10) for _ in xrange(30)] +df = DataFrame(np.random.rand(1000, 30), index=index, + columns=columns) +idx = index[100] +col = columns[10] +""" +statement = "df[col][idx]" +bm_df_getitem = Benchmark(statement, setup, + name='dataframe_getitem_scalar') + +setup = common_setup + """ +try: + klass = DataMatrix +except: + klass = DataFrame + +index = [tm.rands(10) for _ in xrange(1000)] +columns = [tm.rands(10) for _ in xrange(30)] +df = klass(np.random.rand(1000, 30), index=index, + columns=columns) +idx = index[100] +col = columns[10] +""" +statement = "df[col][idx]" +bm_df_getitem2 = Benchmark(statement, setup, + name='datamatrix_getitem_scalar') + +setup = common_setup + """ +try: + klass = DataMatrix +except: + klass = DataFrame + +index = [tm.rands(10) for _ in xrange(1000)] +columns = [tm.rands(10) for _ in xrange(30)] +df = klass(np.random.rand(1000, 30), index=index, + columns=columns) +idx = index[100] +col = columns[10] +""" +statement = "df.get_value(idx, col)" +bm_df_getitem3 = Benchmark(statement, setup, + name='dataframe_get_value', + start_date=datetime(2011, 11, 12)) + +#---------------------------------------------------------------------- +# Boolean 
DataFrame row selection + +setup = common_setup + """ +df = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) +indexer = df['B'] > 0 +obj_indexer = indexer.astype('O') +""" +indexing_dataframe_boolean_rows = \ + Benchmark("df[indexer]", setup, name='indexing_dataframe_boolean_rows') + +indexing_dataframe_boolean_rows_object = \ + Benchmark("df[obj_indexer]", setup, + name='indexing_dataframe_boolean_rows_object') + +#---------------------------------------------------------------------- +# MultiIndex sortlevel + +setup = common_setup + """ +a = np.repeat(np.arange(100), 1000) +b = np.tile(np.arange(1000), 100) +midx = MultiIndex.from_arrays([a, b]) +midx = midx.take(np.random.permutation(np.arange(100000))) +""" +sort_level_zero = Benchmark("midx.sortlevel(0)", setup, + start_date=datetime(2012,1,1)) +sort_level_one = Benchmark("midx.sortlevel(1)", setup, + start_date=datetime(2012,1,1)) diff --git a/vb_suite/io_bench.py b/vb_suite/io_bench.py new file mode 100644 index 00000000..d421466d --- /dev/null +++ b/vb_suite/io_bench.py @@ -0,0 +1,46 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# read_csv + +setup1 = common_setup + """ +index = [rands(10) for _ in xrange(10000)] +df = DataFrame({'float1' : randn(10000), + 'float2' : randn(10000), + 'string1' : ['foo'] * 10000, + 'bool1' : [True] * 10000, + 'int1' : np.random.randint(0, 100000, size=10000)}, + index=index) +df.to_csv('__test__.csv') +""" + +read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1, + start_date=datetime(2011, 9, 15)) + + +#---------------------------------------------------------------------- +# write_csv + +setup2 = common_setup + """ +index = [rands(10) for _ in xrange(10000)] +df = DataFrame({'float1' : randn(10000), + 'float2' : randn(10000), + 'string1' : ['foo'] * 10000, + 'bool1' : [True] * 10000, + 'int1' : np.random.randint(0, 100000, size=10000)}, + index=index) +""" + +write_csv_standard = Benchmark("df.to_csv('__test__.csv')", setup2, + start_date=datetime(2011, 9, 15)) + +#---------------------------------- +setup = common_setup + """ +df = DataFrame(np.random.randn(3000, 30)) +""" +frame_to_csv = Benchmark("df.to_csv('__test__.csv')", setup, + start_date=datetime(2011, 1, 1)) diff --git a/vb_suite/join_merge.py b/vb_suite/join_merge.py new file mode 100644 index 00000000..b568555c --- /dev/null +++ b/vb_suite/join_merge.py @@ -0,0 +1,169 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +setup = common_setup + """ +level1 = np.array([rands(10) for _ in xrange(10)], dtype='O') +level2 = np.array([rands(10) for _ in xrange(1000)], dtype='O') +label1 = np.arange(10).repeat(1000) +label2 = np.tile(np.arange(1000), 10) + +key1 = np.tile(level1.take(label1), 10) +key2 = np.tile(level2.take(label2), 10) + +shuf = np.arange(100000) +random.shuffle(shuf) +try: + index2 = MultiIndex(levels=[level1, level2], labels=[label1, label2]) + index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) + df_multi = DataFrame(np.random.randn(len(index2), 4), index=index2, + columns=['A', 'B', 'C', 'D']) +except: # pre-MultiIndex + pass + +try: + DataFrame = DataMatrix +except: + pass + +df = DataFrame({'data1' : 
np.random.randn(100000), + 'data2' : np.random.randn(100000), + 'key1' : key1, + 'key2' : key2}) + + +df_key1 = DataFrame(np.random.randn(len(level1), 4), index=level1, + columns=['A', 'B', 'C', 'D']) +df_key2 = DataFrame(np.random.randn(len(level2), 4), index=level2, + columns=['A', 'B', 'C', 'D']) + +df_shuf = df.reindex(df.index[shuf]) +""" + +#---------------------------------------------------------------------- +# DataFrame joins on key + +join_dataframe_index_single_key_small = \ + Benchmark("df.join(df_key1, on='key1')", setup, + name='join_dataframe_index_single_key_small') + +join_dataframe_index_single_key_bigger = \ + Benchmark("df.join(df_key2, on='key2')", setup, + name='join_dataframe_index_single_key_bigger') + +join_dataframe_index_single_key_bigger_sort = \ + Benchmark("df_shuf.join(df_key2, on='key2', sort=True)", setup, + name='join_dataframe_index_single_key_bigger', + start_date=datetime(2012, 2, 5)) + +join_dataframe_index_multi = \ + Benchmark("df.join(df_multi, on=['key1', 'key2'])", setup, + name='join_dataframe_index_multi', + start_date=datetime(2011, 10, 20)) + +#---------------------------------------------------------------------- +# Joins on integer keys + +join_dataframe_integer_key = Benchmark("merge(df, df2, on='key')", setup, + start_date=datetime(2011, 10, 20)) + +#---------------------------------------------------------------------- +# DataFrame joins on index + + + +#---------------------------------------------------------------------- +# Merges + +#---------------------------------------------------------------------- +# Appending DataFrames + +setup = common_setup + """ +df1 = DataFrame(np.random.randn(10000, 4), columns=['A', 'B', 'C', 'D']) +df2 = df1.copy() +df2.index = np.arange(10000, 20000) +mdf1 = df1.copy() +mdf1['obj1'] = 'bar' +mdf1['obj2'] = 'bar' +mdf1['int1'] = 5 +try: + mdf1.consolidate(inplace=True) +except: + pass +mdf2 = mdf1.copy() +mdf2.index = df2.index +""" + +stmt = "df1.append(df2)" +append_frame_single_homogenous = \ + Benchmark(stmt, setup, name='append_frame_single_homogenous', + ncalls=500, repeat=1) + +stmt = "mdf1.append(mdf2)" +append_frame_single_mixed = Benchmark(stmt, setup, + name='append_frame_single_mixed', + ncalls=500, repeat=1) + +#---------------------------------------------------------------------- +# data alignment + +setup = common_setup + """n = 1000000 +# indices = Index([rands(10) for _ in xrange(n)]) +def sample(values, k): + sampler = np.random.permutation(len(values)) + return values.take(sampler[:k]) +sz = 500000 +rng = np.arange(0, 10000000000000, 10000000) +stamps = np.datetime64(datetime.now()).view('i8') + rng +idx1 = np.sort(sample(stamps, sz)) +idx2 = np.sort(sample(stamps, sz)) +ts1 = Series(np.random.randn(sz), idx1) +ts2 = Series(np.random.randn(sz), idx2) +""" +stmt = "ts1 + ts2" +series_align_int64_index = \ + Benchmark(stmt, setup, + name="series_align_int64_index", + start_date=datetime(2010, 6, 1), logy=True) + +stmt = "ts1.align(ts2, join='left')" +series_align_left_monotonic = \ + Benchmark(stmt, setup, + name="series_align_left_monotonic", + start_date=datetime(2011, 12, 1), logy=True) + +#---------------------------------------------------------------------- +# Concat Series axis=1 + +setup = common_setup + """ +n = 1000 +indices = Index([rands(10) for _ in xrange(1000)]) +s = Series(n, index=indices) +pieces = [s[i:-i] for i in range(1, 10)] +pieces = pieces * 50 +""" + +concat_series_axis1 = Benchmark('concat(pieces, axis=1)', setup, + start_date=datetime(2012, 2, 27)) + 
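For context on what the alignment and concatenation benchmarks above are timing, here is a minimal illustrative sketch (not part of the benchmark suite; the example indexes are made up) of pandas' automatic index alignment when two Series with different indexes are added:

import numpy as np
from pandas import Series

s1 = Series(np.random.randn(5), index=[0, 2, 4, 6, 8])
s2 = Series(np.random.randn(5), index=[0, 1, 2, 3, 4])

total = s1 + s2                    # result is reindexed to the sorted union of both indexes
print(total.index.tolist())       # [0, 1, 2, 3, 4, 6, 8]
print(int(total.isnull().sum()))  # 4 -- only labels 0, 2 and 4 appear in both inputs

The "ts1 + ts2" statement benchmarked above exercises the same alignment path, just with half a million timestamp labels in each Series.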
+#---------------------------------------------------------------------- +# Ordered merge + +setup = common_setup + """ +groups = np.array([rands(10) for _ in xrange(10)], dtype='O') + +left = DataFrame({'group': groups.repeat(5000), + 'key' : np.tile(np.arange(0, 10000, 2), 10), + 'lvalue': np.random.randn(50000)}) + +right = DataFrame({'key' : np.arange(10000), + 'rvalue' : np.random.randn(10000)}) + +""" + +stmt = "ordered_merge(left, right, on='key', left_by='group')" diff --git a/vb_suite/make.py b/vb_suite/make.py new file mode 100755 index 00000000..c97b9c92 --- /dev/null +++ b/vb_suite/make.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python + +""" +Python script for building documentation. + +To build the docs you must have all optional dependencies for statsmodels +installed. See the installation instructions for a list of these. + +Note: currently latex builds do not work because of table formats that are not +supported in the latex generation. + +Usage +----- +python make.py clean +python make.py html +""" + +import glob +import os +import shutil +import sys +import sphinx + +os.environ['PYTHONPATH'] = '..' + +SPHINX_BUILD = 'sphinxbuild' + +def upload(): + 'push a copy to the site' + os.system('cd build/html; rsync -avz . pandas@pandas.pydata.org' + ':/usr/share/nginx/pandas/pandas-docs/vbench/ -essh') + +def clean(): + if os.path.exists('build'): + shutil.rmtree('build') + + if os.path.exists('source/generated'): + shutil.rmtree('source/generated') + +def html(): + check_build() + if os.system('sphinx-build -P -b html -d build/doctrees ' + 'source build/html'): + raise SystemExit("Building HTML failed.") + +def check_build(): + build_dirs = [ + 'build', 'build/doctrees', 'build/html', + 'build/plots', 'build/_static', + 'build/_templates'] + for d in build_dirs: + try: + os.mkdir(d) + except OSError: + pass + +def all(): + clean() + html() + +def auto_update(): + msg = '' + try: + clean() + html() + upload() + sendmail() + except (Exception, SystemExit), inst: + msg += str(inst) + '\n' + sendmail(msg) + +def sendmail(err_msg=None): + from_name, to_name = _get_config() + + if err_msg is None: + msgstr = 'Daily vbench uploaded successfully' + subject = "VB: daily update successful" + else: + msgstr = err_msg + subject = "VB: daily update failed" + + import smtplib + from email.MIMEText import MIMEText + msg = MIMEText(msgstr) + msg['Subject'] = subject + msg['From'] = from_name + msg['To'] = to_name + + server_str, port, login, pwd = _get_credentials() + server = smtplib.SMTP(server_str, port) + server.ehlo() + server.starttls() + server.ehlo() + + server.login(login, pwd) + try: + server.sendmail(from_name, to_name, msg.as_string()) + finally: + server.close() + +def _get_dir(): + import getpass + USERNAME = getpass.getuser() + if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME + else: + HOME = '/home/%s' % USERNAME + + tmp_dir = '%s/tmp' % HOME + return tmp_dir + +def _get_credentials(): + tmp_dir = _get_dir() + cred = '%s/credentials' % tmp_dir + with open(cred, 'r') as fh: + server, port, un, domain = fh.read().split(',') + port = int(port) + login = un + '@' + domain + '.com' + + import base64 + with open('%s/cron_email_pwd' % tmp_dir, 'r') as fh: + pwd = base64.b64decode(fh.read()) + + return server, port, login, pwd + +def _get_config(): + tmp_dir = _get_dir() + with open('%s/config' % tmp_dir, 'r') as fh: + from_name, to_name = fh.read().split(',') + return from_name, to_name + +funcd = { + 'html' : html, + 'clean' : clean, + 'upload' : upload, + 'auto_update' : 
auto_update, + 'all' : all, + } + +small_docs = False + +# current_dir = os.getcwd() +# os.chdir(os.path.dirname(os.path.join(current_dir, __file__))) + +if len(sys.argv)>1: + for arg in sys.argv[1:]: + func = funcd.get(arg) + if func is None: + raise SystemExit('Do not know how to handle %s; valid args are %s'%( + arg, funcd.keys())) + func() +else: + small_docs = False + all() +#os.chdir(current_dir) diff --git a/vb_suite/miscellaneous.py b/vb_suite/miscellaneous.py new file mode 100644 index 00000000..eeeaf01a --- /dev/null +++ b/vb_suite/miscellaneous.py @@ -0,0 +1,34 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# cache_readonly + +setup = common_setup + """ +from pandas.util.decorators import cache_readonly + +class Foo: + + @cache_readonly + def prop(self): + return 5 +obj = Foo() +""" +misc_cache_readonly = Benchmark("obj.prop", setup, name="misc_cache_readonly", + ncalls=2000000) + +#---------------------------------------------------------------------- +# match + +setup = common_setup + """ +from pandas.util.testing import rands + +uniques = np.array([rands(10) for _ in xrange(1000)], dtype='O') +all = uniques.repeat(10) +""" + +match_strings = Benchmark("match(all, uniques)", setup, + start_date=datetime(2012, 5, 12)) diff --git a/vb_suite/pandas_vb_common.py b/vb_suite/pandas_vb_common.py new file mode 100644 index 00000000..6009f294 --- /dev/null +++ b/vb_suite/pandas_vb_common.py @@ -0,0 +1,18 @@ +from pandas import * +from pandas.util.testing import rands +from datetime import timedelta +from numpy.random import randn +import pandas.util.testing as tm +import random +import numpy as np + +try: + Panel = WidePanel +except Exception: + pass + +# didn't add to namespace until later +try: + from pandas.core.index import MultiIndex +except ImportError: + pass diff --git a/vb_suite/panel_ctor.py b/vb_suite/panel_ctor.py new file mode 100644 index 00000000..07d7326e --- /dev/null +++ b/vb_suite/panel_ctor.py @@ -0,0 +1,74 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# Panel.from_dict homogenization time + +START_DATE = datetime(2011, 6, 1) + +setup_same_index = common_setup + """ +# create 100 dataframes with the same index +dr = np.asarray(DateRange(datetime(1990,1,1), datetime(2012,1,1))) +data_frames = {} +for x in xrange(100): + df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), + "c": [2]*len(dr)}, index=dr) + data_frames[x] = df +""" + +panel_from_dict_same_index = \ + Benchmark("Panel.from_dict(data_frames)", + setup_same_index, name='panel_from_dict_same_index', + start_date=START_DATE, repeat=1, logy=True) + +setup_equiv_indexes = common_setup + """ +data_frames = {} +for x in xrange(100): + dr = np.asarray(DateRange(datetime(1990,1,1), datetime(2012,1,1))) + df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), + "c": [2]*len(dr)}, index=dr) + data_frames[x] = df +""" + +panel_from_dict_equiv_indexes = \ + Benchmark("Panel.from_dict(data_frames)", + setup_equiv_indexes, name='panel_from_dict_equiv_indexes', + start_date=START_DATE, repeat=1, logy=True) + +setup_all_different_indexes = common_setup + """ +data_frames = {} +start = datetime(1990,1,1) +end = datetime(2012,1,1) +for x in xrange(100): + end += timedelta(days=1) + dr = 
np.asarray(DateRange(start, end)) + df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), + "c": [2]*len(dr)}, index=dr) + data_frames[x] = df +""" +panel_from_dict_all_different_indexes = \ + Benchmark("Panel.from_dict(data_frames)", + setup_all_different_indexes, + name='panel_from_dict_all_different_indexes', + start_date=START_DATE, repeat=1, logy=True) + +setup_two_different_indexes = common_setup + """ +data_frames = {} +start = datetime(1990,1,1) +end = datetime(2012,1,1) +for x in xrange(100): + if x == 50: + end += timedelta(days=1) + dr = np.asarray(DateRange(start, end)) + df = DataFrame({"a": [0]*len(dr), "b": [1]*len(dr), + "c": [2]*len(dr)}, index=dr) + data_frames[x] = df +""" +panel_from_dict_two_different_indexes = \ + Benchmark("Panel.from_dict(data_frames)", + setup_two_different_indexes, + name='panel_from_dict_two_different_indexes', + start_date=START_DATE, repeat=1, logy=True) diff --git a/vb_suite/parser.py b/vb_suite/parser.py new file mode 100644 index 00000000..946e1327 --- /dev/null +++ b/vb_suite/parser.py @@ -0,0 +1,91 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +setup = common_setup + """ +from pandas import read_csv +import os +N = 10000 +K = 8 +df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))) +df.to_csv('test.csv', sep='|') +""" + +read_csv_vb = Benchmark("read_csv('test.csv', sep='|')", setup, + cleanup="os.remove('test.csv')", + start_date=datetime(2012, 5, 7)) + + +setup = common_setup + """ +from pandas import read_csv +import os +N = 10000 +K = 8 +format = lambda x: '{:,}'.format(x) +df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))) +df = df.applymap(format) +df.to_csv('test.csv', sep='|') +""" + +read_csv_thou_vb = Benchmark("read_csv('test.csv', sep='|', thousands=',')", + setup, + cleanup="os.remove('test.csv')", + start_date=datetime(2012, 5, 7)) + +setup = common_setup + """ +from pandas import read_csv +import os +N = 10000 +K = 8 +format = lambda x: '%f' % x +df = DataFrame(np.random.randn(N, K) * np.random.randint(100, 10000, (N, K))) +df = df.applymap(format) +df.ix[:5, 0] = '#' +df.to_csv('test.csv', sep='|') +""" + +read_csv_comment_vb = Benchmark("read_csv('test.csv', sep='|', comment='#')", + setup, + cleanup="os.remove('test.csv')", + start_date=datetime(2012, 5, 7)) + +setup = common_setup + """ +from pandas import read_table +from cStringIO import StringIO +import os +N = 10000 +K = 8 +data = '''\ +KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' +data = data * 2000 +""" +cmd = ("read_table(StringIO(data), sep=',', header=None, " + "parse_dates=[[1,2], [1,3]])") +sdate = datetime(2012, 5, 7) +read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate) + +setup = common_setup + """ +from pandas import read_table +from cStringIO import StringIO +import os +N = 10000 +K = 8 +data = '''\ +KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 +KORD,19990127 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 +KORD,19990127 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 +KORD,19990127 21:00:00, 21:18:00, -0.9900, 
2.0100, 3.6000, 0.0000, 270.0000 +KORD,19990127 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 +''' +data = data * 2000 +""" +cmd = "read_table(StringIO(data), sep=',', header=None, parse_dates=[1])" +sdate = datetime(2012, 5, 7) +read_table_multiple_date_baseline = Benchmark(cmd, setup, start_date=sdate) diff --git a/vb_suite/reindex.py b/vb_suite/reindex.py new file mode 100644 index 00000000..589ba166 --- /dev/null +++ b/vb_suite/reindex.py @@ -0,0 +1,179 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# DataFrame reindex columns + +setup = common_setup + """ +df = DataFrame(index=range(10000), data=np.random.rand(10000,30), + columns=range(30)) +""" +statement = "df.reindex(columns=df.columns[1:5])" + +reindex_frame_columns = Benchmark(statement, setup, + name='dataframe_reindex_columns') + +#---------------------------------------------------------------------- + +setup = common_setup + """ +rng = DateRange('1/1/1970', periods=10000, offset=datetools.Minute()) +df = DataFrame(np.random.rand(10000, 10), index=rng, + columns=range(10)) +df['foo'] = 'bar' +rng2 = Index(rng[::2]) +""" +statement = "df.reindex(rng2)" +reindex_frame_daterange = Benchmark(statement, setup, + name='dataframe_reindex_daterange') + +#---------------------------------------------------------------------- +# multiindex reindexing + +setup = common_setup + """ +N = 1000 +K = 20 + +level1 = np.array([tm.rands(10) for _ in xrange(N)], dtype='O').repeat(K) +level2 = np.tile(np.array([tm.rands(10) for _ in xrange(K)], dtype='O'), + N) +index = MultiIndex.from_arrays([level1, level2]) + +s1 = Series(np.random.randn(N * K), index=index) +s2 = s1[::2] +""" +statement = "s1.reindex(s2.index)" +reindex_multi = Benchmark(statement, setup, + name='reindex_multiindex', + start_date=datetime(2011, 9, 1)) + +#---------------------------------------------------------------------- +# Pad / backfill + +setup = common_setup + """ +rng = DateRange('1/1/2000', periods=10000, offset=datetools.Minute()) + +ts = Series(np.random.randn(len(rng)), index=rng) +ts2 = ts[::2] +ts3 = ts2.reindex(ts.index) + +def pad(): + try: + ts2.reindex(ts.index, method='pad') + except: + ts2.reindex(ts.index, fillMethod='pad') +def backfill(): + try: + ts2.reindex(ts.index, method='backfill') + except: + ts2.reindex(ts.index, fillMethod='backfill') +""" + +statement = "pad()" +reindex_daterange_pad = Benchmark(statement, setup, + name="reindex_daterange_pad") + +statement = "backfill()" +reindex_daterange_backfill = Benchmark(statement, setup, + name="reindex_daterange_backfill") + +reindex_fillna_pad = Benchmark("ts3.fillna(method='pad')", setup, + name="reindex_fillna_pad", + start_date=datetime(2011, 3, 1)) + +reindex_fillna_backfill = Benchmark("ts3.fillna(method='backfill')", setup, + name="reindex_fillna_backfill", + start_date=datetime(2011, 3, 1)) + +#---------------------------------------------------------------------- +# align on level + +setup = common_setup + """ +index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) +random.shuffle(index.values) +df = DataFrame(np.random.randn(len(index), 4), index=index) +df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1]) +""" + +reindex_frame_level_align = \ + 
Benchmark("df.align(df_level, level=1, copy=False)", setup, + name='reindex_frame_level_align', + start_date=datetime(2011, 12, 27)) + +reindex_frame_level_reindex = \ + Benchmark("df_level.reindex(df.index, level=1)", setup, + name='reindex_frame_level_reindex', + start_date=datetime(2011, 12, 27)) + + +#---------------------------------------------------------------------- +# sort_index, drop_duplicates + +# pathological, but realistic +setup = common_setup + """ +import pandas._tseries as lib +N = 10000 +K = 10 + +key1 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K) +key2 = np.array([rands(10) for _ in xrange(N)], dtype='O').repeat(K) + +df = DataFrame({'key1' : key1, 'key2' : key2, + 'value' : np.random.randn(N * K)}) +""" +statement = "df.sort_index(by=['key1', 'key2'])" +frame_sort_index_by_columns = Benchmark(statement, setup, + name='frame_sort_index_by_columns', + start_date=datetime(2011, 11, 1)) + +# drop_duplicates + +statement = "df.drop_duplicates(['key1', 'key2'])" +frame_drop_duplicates = Benchmark(statement, setup, + name='frame_drop_duplicates', + start_date=datetime(2011, 11, 15)) + +statement = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_inplace = Benchmark(statement, setup, + name='frame_drop_dup_inplace', + start_date=datetime(2012, 5, 16)) + +lib_fast_zip = Benchmark('lib.fast_zip(df.values.T)', setup, + name='lib_fast_zip', + start_date=datetime(2012, 1, 1)) + +setup = setup + """ +df.ix[:10000, :] = np.nan +""" +statement2 = "df.drop_duplicates(['key1', 'key2'])" +frame_drop_duplicates_na = Benchmark(statement2, setup, + name='frame_drop_duplicates_na', + start_date=datetime(2012, 5, 15)) + +lib_fast_zip_fillna = Benchmark('lib.fast_zip_fillna(df.values.T)', setup, + name='lib_fast_zip_fillna', + start_date=datetime(2012, 5, 15)) + +statement2 = "df.drop_duplicates(['key1', 'key2'], inplace=True)" +frame_drop_dup_na_inplace = Benchmark(statement2, setup, + name='frame_drop_dup_na_inplace', + start_date=datetime(2012, 5, 16)) + +#---------------------------------------------------------------------- +# fillna, many columns + + +setup = common_setup + """ +values = np.random.randn(1000, 1000) +values[::2] = np.nan +df = DataFrame(values) +""" + +frame_fillna_many_columns_pad = Benchmark("df.fillna(method='pad')", + setup, + start_date=datetime(2011, 3, 1)) diff --git a/vb_suite/replace.py b/vb_suite/replace.py new file mode 100644 index 00000000..263ebcd6 --- /dev/null +++ b/vb_suite/replace.py @@ -0,0 +1,28 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +from datetime import timedelta +import pandas._tseries as lib +N = 1000000 + +try: + rng = date_range('1/1/2000', periods=N, freq='min') +except NameError: + rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute()) + date_range = DateRange + +ts = Series(np.random.randn(N), index=rng) + +def replace_slow(ser, old, new): + lib.slow_replace(ser.values, old, new) + return ser +""" + +replace_fillna = Benchmark('ts.fillna(0., inplace=True)', common_setup, + start_date=datetime(2012, 4, 4)) +replace_replacena = Benchmark('ts.replace(np.nan, 0., inplace=True)', + common_setup, + start_date=datetime(2012, 5, 15)) +replace_putmask = Benchmark('replace_slow(ts, np.nan, 0.)', common_setup, + start_date=datetime(2012, 5, 15)) diff --git a/vb_suite/reshape.py b/vb_suite/reshape.py new file mode 100644 index 00000000..6212d11d --- /dev/null +++ b/vb_suite/reshape.py @@ -0,0 +1,34 @@ +from vbench.api import 
Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +index = MultiIndex.from_arrays([np.arange(100).repeat(100), + np.roll(np.tile(np.arange(100), 100), 25)]) +df = DataFrame(np.random.randn(10000, 4), index=index) +""" + +reshape_unstack_simple = Benchmark('df.unstack(1)', common_setup, + start_date=datetime(2011, 10, 1)) + +setup = common_setup + """ +udf = df.unstack(1) +""" + +reshape_stack_simple = Benchmark('udf.stack()', setup, + start_date=datetime(2011, 10, 1)) + +setup = common_setup + """ +def unpivot(frame): + N, K = frame.shape + data = {'value' : frame.values.ravel('F'), + 'variable' : np.asarray(frame.columns).repeat(N), + 'date' : np.tile(np.asarray(frame.index), K)} + return DataFrame(data, columns=['date', 'variable', 'value']) +index = date_range('1/1/2000', periods=10000, freq='h') +df = DataFrame(randn(10000, 50), index=index, columns=range(50)) +pdf = unpivot(df) +f = lambda: pdf.pivot('date', 'variable', 'value') +""" + +reshape_pivot_time_series = Benchmark('f()', setup, + start_date=datetime(2012, 5, 1)) diff --git a/vb_suite/run_suite.py b/vb_suite/run_suite.py new file mode 100644 index 00000000..febd9d1f --- /dev/null +++ b/vb_suite/run_suite.py @@ -0,0 +1,13 @@ +from vbench.api import BenchmarkRunner +from suite import * + +def run_process(): + runner = BenchmarkRunner(benchmarks, REPO_PATH, REPO_URL, + BUILD, DB_PATH, TMP_DIR, PREPARE, + always_clean=True, + run_option='eod', start_date=START_DATE, + module_dependencies=dependencies) + runner.run() + +if __name__ == '__main__': + run_process() diff --git a/vb_suite/source/_static/stub b/vb_suite/source/_static/stub new file mode 100644 index 00000000..e69de29b diff --git a/vb_suite/source/conf.py b/vb_suite/source/conf.py new file mode 100644 index 00000000..35a89ba8 --- /dev/null +++ b/vb_suite/source/conf.py @@ -0,0 +1,224 @@ +# -*- coding: utf-8 -*- +# +# pandas documentation build configuration file, created by +# +# This file is execfile()d with the current directory set to its containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys, os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +#sys.path.append(os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('../sphinxext')) + +sys.path.extend([ + + # numpy standard doc extensions + os.path.join(os.path.dirname(__file__), + '..', '../..', + 'sphinxext') + +]) + +# -- General configuration ----------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. sphinxext. + +extensions = ['sphinx.ext.autodoc', + 'sphinx.ext.doctest'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates', '_templates/autosummary'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. 
+project = u'pandas' +copyright = u'2008-2011, the pandas development team' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import pandas + +# version = '%s r%s' % (pandas.__version__, svn_version()) +version = '%s' % (pandas.__version__) + +# The full version, including alpha/beta/rc tags. +release = version + +# JP: added from sphinxdocs +autosummary_generate = True + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +#language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of documents that shouldn't be included in the build. +#unused_docs = [] + +# List of directories, relative to source directory, that shouldn't be searched +# for source files. +exclude_trees = [] + +# The reST default role (used for this markup: `text`) to use for all documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. Major themes that come with +# Sphinx are currently 'default' and 'sphinxdoc'. +html_theme = 'agogo' + +# The style sheet to use for HTML and HTML Help pages. A file of that name +# must exist either in Sphinx' static/ path, or in one of the custom paths +# given in html_static_path. +#html_style = 'statsmodels.css' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = ['themes'] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +html_title = 'Vbench performance benchmarks for pandas' + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. 
+#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +html_use_modindex = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = '' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'performance' + + +# -- Options for LaTeX output -------------------------------------------------- + +# The paper size ('letter' or 'a4'). +#latex_paper_size = 'letter' + +# The font size ('10pt', '11pt' or '12pt'). +#latex_font_size = '10pt' + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +latex_documents = [ + ('index', 'performance.tex', + u'pandas vbench Performance Benchmarks', + u'Wes McKinney', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# Additional stuff for the LaTeX preamble. +#latex_preamble = '' + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_use_modindex = True + + +# Example configuration for intersphinx: refer to the Python standard library. +# intersphinx_mapping = {'http://docs.scipy.org/': None} +import glob +autosummary_generate = glob.glob("*.rst") diff --git a/vb_suite/source/themes/agogo/layout.html b/vb_suite/source/themes/agogo/layout.html new file mode 100644 index 00000000..cd0f3d7f --- /dev/null +++ b/vb_suite/source/themes/agogo/layout.html @@ -0,0 +1,95 @@ +{# + agogo/layout.html + ~~~~~~~~~~~~~~~~~ + + Sphinx layout template for the agogo theme, originally written + by Andi Albrecht. + + :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + :license: BSD, see LICENSE for details. +#} +{% extends "basic/layout.html" %} + +{% block header %} +
+    {%- if logo %}
+    {%- endif %}
+    {%- block headertitle %}
+    {{ shorttitle|e }}
+    {%- endblock %}
+    {%- for rellink in rellinks|reverse %}
+    {{ rellink[3] }}
+    {%- if not loop.last %}{{ reldelim2 }}{% endif %}
+    {%- endfor %}
+{% endblock %}
+
+{% block content %}
+    {%- block document %}
+    {{ super() }}
+    {%- endblock %}
+{% endblock %} + +{% block footer %} + +{% endblock %} + +{% block relbar1 %}{% endblock %} +{% block relbar2 %}{% endblock %} diff --git a/vb_suite/source/themes/agogo/static/agogo.css_t b/vb_suite/source/themes/agogo/static/agogo.css_t new file mode 100644 index 00000000..ef909b72 --- /dev/null +++ b/vb_suite/source/themes/agogo/static/agogo.css_t @@ -0,0 +1,476 @@ +/* + * agogo.css_t + * ~~~~~~~~~~~ + * + * Sphinx stylesheet -- agogo theme. + * + * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. + * :license: BSD, see LICENSE for details. + * + */ + +* { + margin: 0px; + padding: 0px; +} + +body { + font-family: {{ theme_bodyfont }}; + line-height: 1.4em; + color: black; + background-color: {{ theme_bgcolor }}; +} + + +/* Page layout */ + +div.header, div.content, div.footer { + max-width: {{ theme_pagewidth }}; + margin-left: auto; + margin-right: auto; +} + +div.header-wrapper { + background: {{ theme_headerbg }}; + padding: 1em 1em 0; + border-bottom: 3px solid #2e3436; + min-height: 0px; +} + + +/* Default body styles */ +a { + color: {{ theme_linkcolor }}; +} + +div.bodywrapper a, div.footer a { + text-decoration: underline; +} + +.clearer { + clear: both; +} + +.left { + float: left; +} + +.right { + float: right; +} + +.line-block { + display: block; + margin-top: 1em; + margin-bottom: 1em; +} + +.line-block .line-block { + margin-top: 0; + margin-bottom: 0; + margin-left: 1.5em; +} + +h1, h2, h3, h4 { + font-family: {{ theme_headerfont }}; + font-weight: normal; + color: {{ theme_headercolor2 }}; + margin-bottom: .8em; +} + +h1 { + color: {{ theme_headercolor1 }}; +} + +h2 { + padding-bottom: .5em; + border-bottom: 1px solid {{ theme_headercolor2 }}; +} + +a.headerlink { + visibility: hidden; + color: #dddddd; + padding-left: .3em; +} + +h1:hover > a.headerlink, +h2:hover > a.headerlink, +h3:hover > a.headerlink, +h4:hover > a.headerlink, +h5:hover > a.headerlink, +h6:hover > a.headerlink, +dt:hover > a.headerlink { + visibility: visible; +} + +img { + border: 0; +} + +pre { + background-color: #EEE; + padding: 0.5em; +} + +div.admonition { + margin-top: 10px; + margin-bottom: 10px; + padding: 2px 7px 1px 7px; + border-left: 0.2em solid black; +} + +p.admonition-title { + margin: 0px 10px 5px 0px; + font-weight: bold; +} + +dt:target, .highlighted { + background-color: #fbe54e; +} + +/* Header */ + +/* +div.header { + padding-top: 10px; + padding-bottom: 10px; +} +*/ + +div.header {} + +div.header h1 { + font-family: {{ theme_headerfont }}; + font-weight: normal; + font-size: 180%; + letter-spacing: .08em; +} + +div.header h1 a { + color: white; +} + +div.header div.rel { + text-decoration: none; +} +/* margin-top: 1em; */ + +div.header div.rel a { + margin-top: 1em; + color: {{ theme_headerlinkcolor }}; + letter-spacing: .1em; + text-transform: uppercase; + padding: 3px 1em; +} + +p.logo { + float: right; +} + +img.logo { + border: 0; +} + + +/* Content */ +div.content-wrapper { + background-color: white; + padding: 1em; +} +/* + padding-top: 20px; + padding-bottom: 20px; +*/ + +/* float: left; */ + +div.document { + max-width: {{ theme_documentwidth }}; +} + +div.body { + padding-right: 2em; + text-align: {{ theme_textalign }}; +} + +div.document ul { + margin: 1.5em; + list-style-type: square; +} + +div.document dd { + margin-left: 1.2em; + margin-top: .4em; + margin-bottom: 1em; +} + +div.document .section { + margin-top: 1.7em; +} +div.document .section:first-child { + margin-top: 0px; +} + +div.document div.highlight { + padding: 3px; + background-color: 
#eeeeec; + border-top: 2px solid #dddddd; + border-bottom: 2px solid #dddddd; + margin-top: .8em; + margin-bottom: .8em; +} + +div.document h2 { + margin-top: .7em; +} + +div.document p { + margin-bottom: .5em; +} + +div.document li.toctree-l1 { + margin-bottom: 1em; +} + +div.document .descname { + font-weight: bold; +} + +div.document .docutils.literal { + background-color: #eeeeec; + padding: 1px; +} + +div.document .docutils.xref.literal { + background-color: transparent; + padding: 0px; +} + +div.document blockquote { + margin: 1em; +} + +div.document ol { + margin: 1.5em; +} + + +/* Sidebar */ + + +div.sidebar { + width: {{ theme_sidebarwidth }}; + padding: 0 1em; + float: right; + font-size: .93em; +} + +div.sidebar a, div.header a { + text-decoration: none; +} + +div.sidebar a:hover, div.header a:hover { + text-decoration: underline; +} + +div.sidebar h3 { + color: #2e3436; + text-transform: uppercase; + font-size: 130%; + letter-spacing: .1em; +} + +div.sidebar ul { + list-style-type: none; +} + +div.sidebar li.toctree-l1 a { + display: block; + padding: 1px; + border: 1px solid #dddddd; + background-color: #eeeeec; + margin-bottom: .4em; + padding-left: 3px; + color: #2e3436; +} + +div.sidebar li.toctree-l2 a { + background-color: transparent; + border: none; + margin-left: 1em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l3 a { + background-color: transparent; + border: none; + margin-left: 2em; + border-bottom: 1px solid #dddddd; +} + +div.sidebar li.toctree-l2:last-child a { + border-bottom: none; +} + +div.sidebar li.toctree-l1.current a { + border-right: 5px solid {{ theme_headerlinkcolor }}; +} + +div.sidebar li.toctree-l1.current li.toctree-l2 a { + border-right: none; +} + + +/* Footer */ + +div.footer-wrapper { + background: {{ theme_footerbg }}; + border-top: 4px solid #babdb6; + padding-top: 10px; + padding-bottom: 10px; + min-height: 80px; +} + +div.footer, div.footer a { + color: #888a85; +} + +div.footer .right { + text-align: right; +} + +div.footer .left { + text-transform: uppercase; +} + + +/* Styles copied from basic theme */ + +img.align-left, .figure.align-left, object.align-left { + clear: left; + float: left; + margin-right: 1em; +} + +img.align-right, .figure.align-right, object.align-right { + clear: right; + float: right; + margin-left: 1em; +} + +img.align-center, .figure.align-center, object.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +.align-left { + text-align: left; +} + +.align-center { + clear: both; + text-align: center; +} + +.align-right { + text-align: right; +} + +/* -- search page ----------------------------------------------------------- */ + +ul.search { + margin: 10px 0 0 20px; + padding: 0; +} + +ul.search li { + padding: 5px 0 5px 20px; + background-image: url(file.png); + background-repeat: no-repeat; + background-position: 0 7px; +} + +ul.search li a { + font-weight: bold; +} + +ul.search li div.context { + color: #888; + margin: 2px 0 0 30px; + text-align: left; +} + +ul.keywordmatches li.goodmatch a { + font-weight: bold; +} + +/* -- index page ------------------------------------------------------------ */ + +table.contentstable { + width: 90%; +} + +table.contentstable p.biglink { + line-height: 150%; +} + +a.biglink { + font-size: 1.3em; +} + +span.linkdescr { + font-style: italic; + padding-top: 5px; + font-size: 90%; +} + +/* -- general index --------------------------------------------------------- */ + +table.indextable td { + text-align: left; + vertical-align: top; 
+} + +table.indextable dl, table.indextable dd { + margin-top: 0; + margin-bottom: 0; +} + +table.indextable tr.pcap { + height: 10px; +} + +table.indextable tr.cap { + margin-top: 10px; + background-color: #f2f2f2; +} + +img.toggler { + margin-right: 3px; + margin-top: 3px; + cursor: pointer; +} + +/* -- viewcode extension ---------------------------------------------------- */ + +.viewcode-link { + float: right; +} + +.viewcode-back { + float: right; + font-family:: {{ theme_bodyfont }}; +} + +div.viewcode-block:target { + margin: -1px -3px; + padding: 0 3px; + background-color: #f4debf; + border-top: 1px solid #ac9; + border-bottom: 1px solid #ac9; +} + +th.field-name { + white-space: nowrap; +} diff --git a/vb_suite/source/themes/agogo/static/bgfooter.png b/vb_suite/source/themes/agogo/static/bgfooter.png new file mode 100644 index 0000000000000000000000000000000000000000..9ce5bdd902943fdf8b0c0ca6a545297e1e2cc665 GIT binary patch literal 434 zcmV;j0ZsmiP)Px#24YJ`L;%wO*8tD73qoQ5000SaNLh0L01FcU01FcV0GgZ_00007bV*G`2iXD> z2Q(2CT#42I000?uMObu0Z*6U5Zgc=ca%Ew3Wn>_CX>@2HM@dakSAh-}0003ENklR?sq9~H`=l5UI-{JW_f9!)=Hwush3JC}Y z1gFM&r>$lJNPt^*1k!w;l|obx>lr$2IOaI$n=(gBBaj^I0=y%@K5N&GIU&-%OE_~V zX=m=_j7d`hvubQRuF+xT63vIfWnC3%kKN*T3l7ob3nEC2R->wU1Y)4)(7_t^thiqb zj$CO7xBn9gg`*!MY$}SI|_*)!a*&V0w7h>cUb&$Grh37iJ=C%Yn c>}w1E0Z4f>1OEiDlmGw#07*qoM6N<$g4BwtIsgCw literal 0 HcmV?d00001 diff --git a/vb_suite/source/themes/agogo/static/bgtop.png b/vb_suite/source/themes/agogo/static/bgtop.png new file mode 100644 index 0000000000000000000000000000000000000000..a0d4709bac8f79943a817195c086461c8c4d5419 GIT binary patch literal 430 zcmV;f0a5;mP)Px#24YJ`L;zI)R{&FzA;Z4_000SaNLh0L01FcU01FcV0GgZ_00007bV*G`2iXD> z2Q3AZhV-)l000?uMObu0Z*6U5Zgc=ca%Ew3Wn>_CX>@2HM@dakSAh-}0003ANklMo8vqN`cM=KwSQV|n zk}naE+VzlN;kK@Ej${PSkI$-R6-Yfp`zA;^O$`)7`gRi{-0i?owGIbX{p>Nc##93U z;sA|ayOYkG%F9M0iEMUM*s3NDYSS=KN2ht8Rv|7nv77i{NTO47R)}V_+2H~mL-nTR z_8j}*%6Qm8?#7NU2kM$#gcP&kO?iw|n}ynz+r-~FA9nKcZnfixWvZ&d28Cc_6&_Pe zMpbjI>9r+<=}NIDz4mCd3U++H?rrHcYxH&eeB|)>mnv*N#44ILM2zL6yU!VVWSrgp Y0Yu&#qm)=by8r+H07*qoM6N<$f@HC)j{pDw literal 0 HcmV?d00001 diff --git a/vb_suite/source/themes/agogo/theme.conf b/vb_suite/source/themes/agogo/theme.conf new file mode 100644 index 00000000..3fc88580 --- /dev/null +++ b/vb_suite/source/themes/agogo/theme.conf @@ -0,0 +1,19 @@ +[theme] +inherit = basic +stylesheet = agogo.css +pygments_style = tango + +[options] +bodyfont = "Verdana", Arial, sans-serif +headerfont = "Georgia", "Times New Roman", serif +pagewidth = 70em +documentwidth = 50em +sidebarwidth = 20em +bgcolor = #eeeeec +headerbg = url(bgtop.png) top left repeat-x +footerbg = url(bgfooter.png) top left repeat-x +linkcolor = #ce5c00 +headercolor1 = #204a87 +headercolor2 = #3465a4 +headerlinkcolor = #fcaf3e +textalign = justify \ No newline at end of file diff --git a/vb_suite/sparse.py b/vb_suite/sparse.py new file mode 100644 index 00000000..18cd71fb --- /dev/null +++ b/vb_suite/sparse.py @@ -0,0 +1,29 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- + +setup = common_setup + """ +from pandas.core.sparse import SparseSeries, SparseDataFrame + +K = 50 +N = 50000 +rng = np.asarray(DateRange('1/1/2000', periods=N, + offset=datetools.Minute())) + +# rng2 = np.asarray(rng).astype('M8[ns]').astype('i8') + +series = {} +for i in range(1, K + 1): + data = np.random.randn(N)[:-i] + this_rng = rng[:-i] + data[100:] = 
np.nan + series[i] = SparseSeries(data, index=this_rng) +""" +stmt = "SparseDataFrame(series)" + +bm_sparse1 = Benchmark(stmt, setup, name="sparse_series_to_frame", + start_date=datetime(2011, 6, 1)) diff --git a/vb_suite/stat_ops.py b/vb_suite/stat_ops.py new file mode 100644 index 00000000..03b47ef7 --- /dev/null +++ b/vb_suite/stat_ops.py @@ -0,0 +1,75 @@ +from vbench.benchmark import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +#---------------------------------------------------------------------- +# nanops + +setup = common_setup + """ +s = Series(np.random.randn(100000), index=np.arange(100000)) +s[::2] = np.nan +""" + +stat_ops_series_std = Benchmark("s.std()", setup) + +#---------------------------------------------------------------------- +# ops by level + +setup = common_setup + """ +index = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)], + labels=[np.arange(10).repeat(10000), + np.tile(np.arange(100).repeat(100), 10), + np.tile(np.tile(np.arange(100), 100), 10)]) +random.shuffle(index.values) +df = DataFrame(np.random.randn(len(index), 4), index=index) +df_level = DataFrame(np.random.randn(100, 4), index=index.levels[1]) +""" + +stat_ops_level_frame_sum = \ + Benchmark("df.sum(level=1)", setup, + start_date=datetime(2011, 11, 15)) + +stat_ops_level_frame_sum_multiple = \ + Benchmark("df.sum(level=[0, 1])", setup, repeat=1, + start_date=datetime(2011, 11, 15)) + +stat_ops_level_series_sum = \ + Benchmark("df[1].sum(level=1)", setup, + start_date=datetime(2011, 11, 15)) + +stat_ops_level_series_sum_multiple = \ + Benchmark("df[1].sum(level=[0, 1])", setup, repeat=1, + start_date=datetime(2011, 11, 15)) + +#---------------------------------------------------------------------- +# rank + +setup = common_setup + """ +values = np.concatenate([np.arange(100000), + np.random.randn(100000), + np.arange(100000)]) +s = Series(values) +""" + +stats_rank_average = Benchmark('s.rank()', setup, + start_date=datetime(2011, 12, 12)) + +setup = common_setup + """ +values = np.random.randint(0, 100000, size=200000) +s = Series(values) +""" + +stats_rank_average_int = Benchmark('s.rank()', setup, + start_date=datetime(2011, 12, 12)) + +setup = common_setup + """ +df = DataFrame(np.random.randn(5000, 50)) +""" + +stats_rank2d_axis1_average = Benchmark('df.rank(1)', setup, + start_date=datetime(2011, 12, 12)) + +stats_rank2d_axis0_average = Benchmark('df.rank()', setup, + start_date=datetime(2011, 12, 12)) diff --git a/vb_suite/suite.py b/vb_suite/suite.py new file mode 100644 index 00000000..0a7c4eb9 --- /dev/null +++ b/vb_suite/suite.py @@ -0,0 +1,153 @@ +from vbench.api import Benchmark, GitRepo +from datetime import datetime + +import os + +modules = ['attrs_caching', + 'binary_ops', + 'ctors', + 'frame_ctor', + 'frame_methods', + 'groupby', + 'index_object', + 'indexing', + 'io_bench', + 'join_merge', + 'miscellaneous', + 'panel_ctor', + 'parser', + 'reindex', + 'replace', + 'sparse', + 'reshape', + 'stat_ops', + 'timeseries'] + +by_module = {} +benchmarks = [] + +for modname in modules: + ref = __import__(modname) + by_module[modname] = [v for v in ref.__dict__.values() + if isinstance(v, Benchmark)] + benchmarks.extend(by_module[modname]) + +for bm in benchmarks: + assert(bm.name is not None) + +import getpass +import sys + +USERNAME = getpass.getuser() + +if sys.platform == 'darwin': + HOME = '/Users/%s' % USERNAME +else: + HOME = '/home/%s' % USERNAME + +try: + import ConfigParser + + config = 
ConfigParser.ConfigParser() + config.readfp(open(os.path.expanduser('~/.vbenchcfg'))) + + REPO_PATH = config.get('setup', 'repo_path') + REPO_URL = config.get('setup', 'repo_url') + DB_PATH = config.get('setup', 'db_path') + TMP_DIR = config.get('setup', 'tmp_dir') +except: + REPO_PATH = os.path.join(HOME, 'code/pandas') + REPO_URL = 'git@github.com:pydata/pandas.git' + DB_PATH = os.path.join(HOME, 'code/pandas/vb_suite/benchmarks.db') + TMP_DIR = os.path.join(HOME, 'tmp/vb_pandas') + +PREPARE = """ +python setup.py clean +""" +BUILD = """ +python setup.py build_ext --inplace +""" +dependencies = ['pandas_vb_common.py'] + +START_DATE = datetime(2010, 6, 1) + +repo = GitRepo(REPO_PATH) + +RST_BASE = 'source' + +# HACK! + +#timespan = [datetime(2011, 1, 1), datetime(2012, 1, 1)] + +def generate_rst_files(benchmarks): + import matplotlib as mpl + mpl.use('Agg') + import matplotlib.pyplot as plt + + vb_path = os.path.join(RST_BASE, 'vbench') + fig_base_path = os.path.join(vb_path, 'figures') + + if not os.path.exists(vb_path): + print 'creating %s' % vb_path + os.makedirs(vb_path) + + if not os.path.exists(fig_base_path): + print 'creating %s' % fig_base_path + os.makedirs(fig_base_path) + + for bmk in benchmarks: + print 'Generating rst file for %s' % bmk.name + rst_path = os.path.join(RST_BASE, 'vbench/%s.txt' % bmk.name) + + fig_full_path = os.path.join(fig_base_path, '%s.png' % bmk.name) + + # make the figure + plt.figure(figsize=(10, 6)) + ax = plt.gca() + bmk.plot(DB_PATH, ax=ax) + + start, end = ax.get_xlim() + + plt.xlim([start - 30, end + 30]) + plt.savefig(fig_full_path, bbox_inches='tight') + plt.close('all') + + fig_rel_path = 'vbench/figures/%s.png' % bmk.name + rst_text = bmk.to_rst(image_path=fig_rel_path) + with open(rst_path, 'w') as f: + f.write(rst_text) + + with open(os.path.join(RST_BASE, 'index.rst'), 'w') as f: + print >> f, """ +Performance Benchmarks +====================== + +These historical benchmark graphs were produced with `vbench +`__. + +The ``pandas_vb_common`` setup script can be found here_ + +.. _here: https://github.com/pydata/pandas/tree/master/vb_suite + +Produced on a machine with + + - Intel Core i7 950 processor + - (K)ubuntu Linux 12.10 + - Python 2.7.2 64-bit (Enthought Python Distribution 7.1-2) + - NumPy 1.6.1 + +.. toctree:: + :hidden: + :maxdepth: 3 +""" + for modname, mod_bmks in sorted(by_module.items()): + print >> f, ' vb_%s' % modname + modpath = os.path.join(RST_BASE, 'vb_%s.rst' % modname) + with open(modpath, 'w') as mh: + header = '%s\n%s\n\n' % (modname, '=' * len(modname)) + print >> mh, header + + for bmk in mod_bmks: + print >> mh, bmk.name + print >> mh, '-' * len(bmk.name) + print >> mh, '.. 
include:: vbench/%s.txt\n' % bmk.name diff --git a/vb_suite/test.py b/vb_suite/test.py new file mode 100644 index 00000000..b565ea37 --- /dev/null +++ b/vb_suite/test.py @@ -0,0 +1,64 @@ +from pandas import * +import matplotlib.pyplot as plt + +import sqlite3 + +from vbench.git import GitRepo + + +REPO_PATH = '/home/adam/code/pandas' +repo = GitRepo(REPO_PATH) + +con = sqlite3.connect('vb_suite/benchmarks.db') + +bmk = '36900a889961162138c140ce4ae3c205' +# bmk = '9d7b8c04b532df6c2d55ef497039b0ce' +bmk = '4481aa4efa9926683002a673d2ed3dac' +bmk = '00593cd8c03d769669d7b46585161726' +bmk = '3725ab7cd0a0657d7ae70f171c877cea' +bmk = '3cd376d6d6ef802cdea49ac47a67be21' +bmk2 = '459225186023853494bc345fd180f395' +bmk = 'c22ca82e0cfba8dc42595103113c7da3' +bmk = 'e0e651a8e9fbf0270ab68137f8b9df5f' +bmk = '96bda4b9a60e17acf92a243580f2a0c3' + +def get_results(bmk): + results = con.execute("select * from results where checksum='%s'" % bmk).fetchall() + x = Series(dict((t[1], t[3]) for t in results)) + x.index = x.index.map(repo.timestamps.get) + x = x.sort_index() + return x + +x = get_results(bmk) + +def graph1(): + dm_getitem = get_results('459225186023853494bc345fd180f395') + dm_getvalue = get_results('c22ca82e0cfba8dc42595103113c7da3') + + plt.figure() + ax = plt.gca() + + dm_getitem.plot(label='df[col][idx]', ax=ax) + dm_getvalue.plot(label='df.get_value(idx, col)', ax=ax) + + plt.ylabel('ms') + plt.legend(loc='best') + +def graph2(): + bm = get_results('96bda4b9a60e17acf92a243580f2a0c3') + plt.figure() + ax = plt.gca() + + bm.plot(ax=ax) + plt.ylabel('ms') + +bm = get_results('36900a889961162138c140ce4ae3c205') +fig = plt.figure() +ax = plt.gca() +bm.plot(ax=ax) +fig.autofmt_xdate() + +plt.xlim([bm.dropna().index[0] - datetools.MonthEnd(), + bm.dropna().index[-1] + datetools.MonthEnd()]) +plt.ylabel('ms') + diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py new file mode 100644 index 00000000..853c473e --- /dev/null +++ b/vb_suite/timeseries.py @@ -0,0 +1,134 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +from datetime import timedelta +N = 100000 + +try: + rng = date_range('1/1/2000', periods=N, freq='min') +except NameError: + rng = DateRange('1/1/2000', periods=N, offset=datetools.Minute()) + def date_range(start=None, end=None, periods=None, freq=None): + return DateRange(start, end, periods=periods, offset=freq) + +if hasattr(Series, 'convert'): + Series.resample = Series.convert + +ts = Series(np.random.randn(N), index=rng) +""" + +#---------------------------------------------------------------------- +# Lookup value in large time series, hash map population + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=1500000, freq='s') +ts = Series(1, index=rng) +""" + +stmt = "ts[ts.index[len(ts) // 2]]; ts.index._cleanup()" +timeseries_large_lookup_value = Benchmark(stmt, setup, + start_date=datetime(2012, 1, 1)) + +#---------------------------------------------------------------------- +# Test slice minutely series + +timeseries_slice_minutely = Benchmark('ts[:10000]', common_setup) + +#---------------------------------------------------------------------- +# Test conversion + +setup = common_setup + """ + +""" + +timeseries_1min_5min_ohlc = Benchmark("ts[:10000].resample('5min', how='ohlc')", + common_setup, + start_date=datetime(2012, 5, 1)) + +timeseries_1min_5min_mean = Benchmark("ts[:10000].resample('5min', how='mean')", + common_setup, + start_date=datetime(2012, 5, 1)) + 
+#---------------------------------------------------------------------- +# Irregular alignment + +setup = common_setup + """ +lindex = np.random.permutation(N)[:N // 2] +rindex = np.random.permutation(N)[:N // 2] +left = Series(ts.values.take(lindex), index=ts.index.take(lindex)) +right = Series(ts.values.take(rindex), index=ts.index.take(rindex)) +""" + +timeseries_add_irregular = Benchmark('left + right', setup) + +#---------------------------------------------------------------------- +# Sort large irregular time series + +setup = common_setup + """ +N = 100000 +rng = date_range('1/1/2000', periods=N, freq='s') +rng = rng.take(np.random.permutation(N)) +ts = Series(np.random.randn(N), index=rng) +""" + +timeseries_sort_index = Benchmark('ts.sort_index()', setup, + start_date=datetime(2012, 4, 1)) + +#---------------------------------------------------------------------- +# Shifting, add offset + +setup = common_setup + """ +rng = date_range('1/1/2000', periods=10000, freq='T') +""" + +datetimeindex_add_offset = Benchmark('rng + timedelta(minutes=2)', setup, + start_date=datetime(2012, 4, 1)) + +setup = common_setup + """ +N = 10000 +rng = date_range('1/1/1990', periods=N, freq='53s') +ts = Series(np.random.randn(N), index=rng) +dates = date_range('1/1/1990', periods=N * 10, freq='5s') +""" +timeseries_asof_single = Benchmark('ts.asof(dates[0])', setup, + start_date=datetime(2012, 4, 27)) + +timeseries_asof = Benchmark('ts.asof(dates)', setup, + start_date=datetime(2012, 4, 27)) + +setup = setup + 'ts[250:5000] = np.nan' + +timeseries_asof_nan = Benchmark('ts.asof(dates)', setup, + start_date=datetime(2012, 4, 27)) + +#---------------------------------------------------------------------- +# Time zone stuff + +setup = common_setup + """ +rng = date_range('1/1/2000', '3/1/2000', tz='US/Eastern') +""" + +timeseries_timestamp_tzinfo_cons = \ + Benchmark('rng[0]', setup, start_date=datetime(2012, 5, 5)) + +#---------------------------------------------------------------------- +# Resampling period + +setup = common_setup + """ +rng = period_range('1/1/2000', '1/1/2001', freq='T') +ts = Series(np.random.randn(len(rng)), index=rng) +""" + +timeseries_period_downsample_mean = \ + Benchmark("ts.resample('D', how='mean')", setup, + start_date=datetime(2012, 4, 25)) + +setup = common_setup + """ +rng = date_range('1/1/2000', '1/1/2001', freq='T') +ts = Series(np.random.randn(len(rng)), index=rng) +""" + +timeseries_timestamp_downsample_mean = \ + Benchmark("ts.resample('D', how='mean')", setup, + start_date=datetime(2012, 4, 25)) -- 2.30.2
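
For anyone reading this import without a vbench checkout handy, below is a minimal standalone sketch of what one of the entries above (frame_drop_duplicates, defined in vb_suite/reindex.py) measures, timed with the standard-library timeit module rather than the vbench harness. It is not part of the imported suite: the rands helper, the repeat/number counts, and the printed units are illustrative assumptions that merely mirror the setup string in the patch.

import timeit

setup = """
import random
import string

import numpy as np
from pandas import DataFrame

def rands(n):
    # stand-in for pandas_vb_common's rands(); an assumption, not the original helper
    return ''.join(random.choice(string.ascii_letters) for _ in range(n))

N, K = 10000, 10
key1 = np.array([rands(10) for _ in range(N)], dtype=object).repeat(K)
key2 = np.array([rands(10) for _ in range(N)], dtype=object).repeat(K)
df = DataFrame({'key1': key1, 'key2': key2,
                'value': np.random.randn(N * K)})
"""

stmt = "df.drop_duplicates(['key1', 'key2'])"

# Best of three repeats, reported as milliseconds per call, which is roughly
# the kind of per-revision figure vbench stores in its results database.
best = min(timeit.repeat(stmt, setup, repeat=3, number=10)) / 10
print('%.3f ms per call' % (best * 1000.0))

Running this against a single pandas build gives one data point; the vb_suite machinery in this patch automates the same measurement across many git revisions and renders the history as the RST/figure output produced by suite.py.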